//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
    "amdgpu-disable-power-sched",
    cl::desc("Disable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> EnableFlatScratch(
    "amdgpu-enable-flat-scratch",
    cl::desc("Use flat scratch instructions"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// Returns true when \p Opcode is known to write zeros to the unused high
/// 16 bits of its 32-bit destination register on this subtarget; unknown
/// opcodes conservatively return false.
/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing; some changed to
    // preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  // Round the request up to the allocation granule, then see how many times
  // that allocation fits into the total VGPR budget.
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has flat scratch init is slightly
  // different from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here, as it doesn't really matter.
  // TODO: Outline this derivation logic into one common function in the
  // backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions so they have a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}