//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
    "amdgpu-disable-power-sched",
    cl::desc("Disable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX940Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasImageInsts(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  // Round register usage up to the allocation granule, then see how many
  // waves at that usage fit in the VGPR file.
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(),
               getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  // If the def or use is inside a bundle, refine the latency based on where
  // the register is actually written or read within the bundle.
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Return true if an artificial edge from \p Pred to \p Succ can be added
  // without creating a cycle, i.e. \p Pred is not already reachable from
  // \p Succ.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}