//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing.
  // Other OSes default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    BackOffBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX940Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasImageInsts(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // ones is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for (; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}