//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
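  // Only one wavefront size can be in effect. If the user explicitly enables
  // one in FS, pre-disable the sizes they did not mention so the explicit
  // request (appended below) overrides the processor default.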
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ?
            SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

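  // On GFX10 most VALU instructions can use up to two constant bus (SGPR or
  // literal) operands; the 64-bit shifts below are still limited to one.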
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

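// The "amdgpu-flat-work-group-size" attribute is parsed as a "min,max" pair,
// e.g. "amdgpu-flat-work-group-size"="128,256"; requests that are inconsistent
// or outside the subtarget's limits are ignored in favor of the default.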
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
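  // For example, with a reqd_work_group_size of 64 in this dimension, an ID
  // query gets the range [0, 64) and a size query gets the range [64, 65).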
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
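  // Each linked SALU instruction gets an artificial edge from \p From (the
  // MFMA), so the scheduler is free to place it into the MFMA's shadow.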
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}