//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU-specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
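
  // Note (illustrative, not an exhaustive list of features): because the
  // user-provided FS string is appended after these defaults and later
  // entries in the combined string win, an explicit "-promote-alloca" or
  // "-sram-ecc" in FS still overrides the corresponding "+..." default added
  // above.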

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads, which don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
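
// Note: the limit returned below is the number of operands of a single VALU
// instruction that may come from the constant bus (SGPRs and literals). GFX10
// raises the pre-GFX10 limit of one to two, except for the 64-bit shifts
// listed in the switch, which keep the older limit of one.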
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}
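
// Minimal worked example of the LDS-limited occupancy computed above (the
// numbers are illustrative, not tied to any particular subtarget): with
// 65536 bytes of LDS, a work group using 16384 bytes, a maximum flat work
// group size of 256 and 64-wide waves, NumGroups = 4 and MaxGroupNumWaves = 4
// (assuming MaxWorkGroupsPerCu allows 4 groups), so MaxWaves = 16 before
// being clamped to getMaxWavesPerEU().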

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
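
// Illustrative attribute usage for the two queries above (the values are
// hypothetical): "amdgpu-flat-work-group-size"="128,256" requests work groups
// of 128-256 lanes, and "amdgpu-waves-per-eu"="2,4" requests 2-4 waves per
// execution unit; a request that is inconsistent or out of range falls back
// to the defaults computed above.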

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
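
// Example of the resulting metadata (assuming a reqd_work_group_size of 64 in
// the queried dimension): a workitem id query gets !range [0, 64) since ids
// are zero-based, while a local size query gets !range [64, 65) because the
// size itself is the only possible value.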

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
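
// Worked example for the explicit argument size (hypothetical signature): a
// kernel taking (i32, double) lays out the i32 at offset 0 and the double at
// offset 8 after alignment, so getExplicitKernArgSize returns 16 with
// MaxAlign = 8; getKernArgSegmentSize then adds any explicit offset and
// implicit arguments and rounds the total up to a multiple of 4.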

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
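
// Worked example for the VGPR-limited occupancy above (granule and register
// counts are illustrative): with a 4-register allocation granule and 256
// total VGPRs, a function using 65 VGPRs rounds up to 68 registers, giving
// min(max(256 / 68, 1), MaxWaves) = 3 waves (assuming MaxWaves >= 3).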

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}
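
// Minimal worked example for the bundle handling above (latencies are
// illustrative): if the defining instruction is the second of four bundled
// instructions and has a latency of 4 cycles, the two bundled instructions
// that follow it already cover 2 of those cycles, so the dependence latency
// reported to the scheduler is reduced to 2.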

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions into a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}