//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
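  // Only one wavefrontsize feature should remain set: if the caller asked for
  // a specific wavefrontsize, explicitly turn the other sizes off so the
  // feature string parsed below does not leave conflicting bits enabled.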
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is not greater than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
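  // (e.g. "amdgpu-flat-work-group-size"="128,256" asks for at least 128 and at
  // most 256 work items, and is only honored if that range lies within the
  // subtarget's own limits.)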
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is not greater than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
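  // For example, with reqd_work_group_size of 256 in this dimension, a
  // workitem id query gets the range [0, 256), while a local size query gets
  // the range [256, 257).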
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
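  // Artificial edges from \p From keep the linked SALU chain scheduled after
  // the MFMA, and independent VALU successors of \p From are ordered after
  // the chain as well.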
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA shadow.
    // Filling the shadow with SALU rather than VALU instructions is desirable
    // to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}