//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

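// Note: initializeSubtargetDependencies() runs via the InstrInfo member
// initializer below, so the feature string is parsed and defaults are
// resolved before TLInfo and the GlobalISel objects (call lowering,
// legalizer, register bank info, instruction selector) are built from *this.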
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

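// Return an upper bound on the number of bytes of LDS a workgroup can use
// while still allowing NWaves waves per execution unit, derived from the
// per-CU local memory size, the maximum waves per EU, and the number of
// workgroups of this function's flat work group size that fit on a CU.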
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

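// Compute the final [minimum, maximum] waves-per-EU range for F: start from
// the subtarget default, tighten the minimum when "amdgpu-flat-work-group-size"
// is present, and honor "amdgpu-waves-per-eu" only if the requested range is
// well formed, within subtarget limits, and consistent with the flat work
// group sizes; otherwise the default range is returned.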
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}