//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

// Pull in the TableGen'd subtarget feature parsing / ctor bodies twice: once
// for the GCN subtarget (with AMDGPUSubtarget temporarily renamed to
// GCNSubtarget so the generated code lands in the right class), and once for
// the R600 subtarget.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

// Parse the R600 feature string (with +promote-alloca forced on) and fix up
// any derived feature bits afterwards.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

// Build the effective feature string for a GCN subtarget (defaults first,
// then the user-provided FS so user settings win), parse it, and patch up
// feature bits that depend on each other or on the selected device.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // XNACK defaults on (see FullFS above); drop it again on devices that do
  // not support it at all.
  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads, but don't support
  // ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

// Note: most of these flags are flipped to their real values by
// ParseSubtargetFeatures, invoked via initializeSubtargetDependencies in the
// InstrInfo member initializer below; the false defaults here just give every
// bit a defined starting state.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

// Number of scalar (SGPR/constant) operands a single VALU instruction may
// read. Pre-GFX10 hardware allows only one; GFX10 allows two, except for the
// 64-bit shifts listed below.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

// Maximum LDS bytes a workgroup of F may use while still sustaining NWaves
// waves of occupancy. Returns 0 if the workgroup size alone already rules out
// any workgroups on the CU.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// Inverse of getMaxLocalMemSizeWithWaveCount: the wave occupancy achievable
// when a workgroup of F uses Bytes bytes of LDS. Clamped to [1, MaxWaves];
// returns 0 only when no workgroup fits on the CU at all.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

// Default (min, max) flat workgroup size for a calling convention. Graphics
// shader stages default to a single wavefront; everything else may use the
// subtarget maximum.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

// (min, max) flat workgroup size for F, honoring the
// "amdgpu-flat-work-group-size" attribute when it is present and valid;
// otherwise falls back to the calling-convention default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

// (min, max) waves per execution unit for F, honoring "amdgpu-waves-per-eu"
// and keeping the result consistent with any explicitly requested flat
// workgroup sizes. Invalid requests fall back to the default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

// Attach !range metadata to a local-id / local-size intrinsic call, derived
// from the kernel's flat workgroup sizes (narrowed by reqd_work_group_size
// when present). Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

// Total bytes of explicit (IR-declared) kernel arguments, each argument
// placed at its ABI alignment. MaxAlign is set (out-param) to the largest
// argument alignment seen.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

// Total kernarg segment size: explicit args plus any implicit trailing args,
// rounded up so scalar loads may dereference slightly past the end.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

// True when V_MAD_F16 maps to a real MC opcode on this subtarget.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

// Prefer VGPR indexing mode when movrel is unavailable, or when the user
// opted in via -amdgpu-vgpr-index-mode and the hardware supports it.
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

// Wave occupancy implied by an SGPR budget (hardware-generation specific
// thresholds; GFX10+ SGPRs no longer limit occupancy).
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

// Wave occupancy implied by a VGPR budget, after rounding the request up to
// the allocation granule.
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

// Number of SGPRs reserved for VCC / FLAT_SCRATCH / XNACK on this
// generation, given what the function actually uses.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

// Combined occupancy estimate: the minimum of the LDS-, SGPR- and
// VGPR-implied limits (register limits only applied when nonzero).
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

// Maximum SGPRs the function may use, honoring "amdgpu-num-sgpr", preloaded
// input SGPRs, the waves-per-EU constraints, and the SGPR-init hardware bug.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

// Maximum VGPRs the function may use, honoring "amdgpu-num-vgpr" and the
// waves-per-EU constraints.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

// Refine the latency of a data dependence when either endpoint is a bundle,
// by scanning the bundled instructions for the def (source side) or the use
// (destination side) of the dependent register.
void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
// Post-RA DAG mutation that adds artificial edges so SALU instructions can be
// scheduled into the shadow of long-latency MFMA instructions (see apply()).
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Conservatively returns true when an artificial Pred->Succ edge cannot
  // create a cycle: either Pred already precedes Succ in the node numbering,
  // or no path exists from Succ back to Pred.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      // Also order the MFMA's VALU successors after the chained SALU, so the
      // SALU really lands in the MFMA shadow rather than after the VALUs.
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

// Dispatch to the concrete subtarget (GCN vs. R600) based on the target
// triple's architecture.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}