//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
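  // If the feature string explicitly requests a wavefront size, turn off the
  // sizes it does not mention so that only one wavefront-size feature can end
  // up enabled at a time.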
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
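  // A 32-lane wavefront (log2 == 5) is used as the fallback here.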
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
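/// Returns true if \p Opcode zeroes the unused high 16 bits of its 32-bit
/// destination rather than preserving them.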
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing them. Some
    // instructions changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has flat scratch init is the same as
  // how MachineFunctionInfo derives it.
  bool FunctionHasFlatScratchInit = false;
  bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
  if (hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(F.getCallingConv()) &&
      (isAmdHsaOrMesa(F) || enableFlatScratch()) &&
      !flatScratchIsArchitected()) {
    if (HasCalls || HasStackObjects || enableFlatScratch())
      FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
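  // Nodes added to the chain are recorded in Visited so that later MFMAs do
  // not reuse them.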
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long latency MFMA instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than
    // VALU is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}