//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
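
// Number of SGPR/literal (constant bus) operands an instruction may read.
// Before GFX10 every VALU instruction is limited to a single constant bus
// read; on GFX10 most instructions may use two, except for the 64-bit shifts
// listed below, which remain limited to one.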
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
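
// Estimates the occupancy implied by the LDS used per workgroup. Purely
// illustrative numbers, not tied to a specific target: with 65536 bytes of
// LDS, MaxWaves = 10 and WorkGroupsPerCu = 8, the budget is
// 65536 * 10 / 8 = 81920 bytes, so a kernel using 16384 bytes of LDS is
// limited to 81920 / 16384 = 5 waves.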
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
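  // Both attributes are "min,max" integer pairs; e.g. (illustrative values)
  // "amdgpu-flat-work-group-size"="128,256" or "amdgpu-waves-per-eu"="2,4".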
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
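  // E.g. with a maximum flat work group size of 256 (illustrative), a
  // workitem id query gets !range !{i32 0, i32 256}, while a local size query
  // with reqd_work_group_size = 256 gets !range !{i32 256, i32 257}.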
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}
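
// Occupancy as limited by the number of SGPRs used per wave. For example,
// reading the thresholds below, a wave using 96 SGPRs on a
// VOLCANIC_ISLANDS-class target is limited to 8 waves per EU.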
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
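
// The final occupancy estimate is the minimum of the LDS-, SGPR- and
// VGPR-derived limits, capped at the subtarget's maximum waves per EU.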
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
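
// Fix up the latency of data dependencies that involve bundles: when the
// producer is a bundle, take the latency of the bundled instruction that
// defines the register, reduced by the number of bundled instructions issued
// after it; when the consumer is a bundle, reduce the producer's latency by
// the number of bundled instructions issued before the first reader.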
void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
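  // Starting from To, walks SALU successors and adds an artificial edge from
  // From (the MFMA) to each of them; VALU successors of From are additionally
  // ordered after the linked SALU instructions.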
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}