//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
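  // If the user explicitly requested one wavefront size, turn the other sizes
  // off so only the requested size remains enabled once the CPU's default
  // features are applied.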
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +/-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +/-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
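  // Default to wave64 (log2 == 5) when the processor did not specify a
  // wavefront size.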
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing; some changed to
    // preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
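  // Determine which dimension the call queries and whether it asks for a
  // workitem ID (IdQuery) or for the local size of that dimension.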
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using just one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).

  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect whether the function has flat scratch init is slightly
  // different from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here, as it doesn't really matter.
  // TODO: Outline this derivation logic into one common function in the
  // backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so they get a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}