//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
    "amdgpu-disable-power-sched",
    cl::desc("Disable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing.
  // Other OSes default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    EnableFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  // Round the per-wave VGPR count up to the allocation granule, then see how
  // many such allocations fit in the VGPR file.
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve SGPR pair used for flat_scratch if
  // we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}