//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
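  // If the feature string explicitly enables one wavefront size, turn the
  // other wavefront-size features off so the final feature string does not
  // request conflicting sizes.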
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
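  // If no wavefront size was set by the features, fall back to wave64
  // (WavefrontSizeLog2 == 5).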
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
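/// Returns true if \p Opcode is known to write zeroes to the high 16 bits of
/// its 32-bit destination register on this subtarget.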
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing; some changed to
    // preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
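  // For example, a hypothetical "amdgpu-flat-work-group-size"="1,2048" request
  // would fall back to the default below if the subtarget's maximum flat work
  // group size is only 1024.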
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
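  // For the dimension being queried, both bounds are pinned to the required
  // size further below (MinSize == MaxSize == ReqdSize).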
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
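  // Rounding the total up to a multiple of 4 means a trailing sub-dword
  // argument can still be fetched with a dword-sized scalar load.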
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect whether the function has flat scratch init differs
  // slightly from how the SIMachineFunctionInfo constructor derives it.
  // We don't use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here, as it doesn't really matter.
  // TODO: Outline this derivation logic and have just
  // one common function in the backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // Dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // Implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so they have a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return std::make_unique<FillMFMAShadowMutation>(&InstrInfo);
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}