//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

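// Note on feature-string composition in the GCN variant below: default
// features are prepended to FullFS and the user-provided FS is appended last,
// so explicit user features win over the defaults chosen here. For example,
// an FS containing "+wavefrontsize32" causes the other wavefront sizes to be
// subtracted first, so only the requested size survives
// ParseSubtargetFeatures.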
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

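// Note for the constructor below: initializeSubtargetDependencies is invoked
// from the member initializer list (via InstrInfo), which ensures the feature
// bits from the feature string are parsed before TLInfo and FrameLowering are
// constructed, since non-static data members are initialized in declaration
// order.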
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

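// Illustrative example of the LDS budgeting arithmetic below (numbers are
// hypothetical): with the 32768-byte default LocalMemorySize, MaxWavesPerEU
// of 10, 8 work groups per CU, and NWaves = 4, the result is
// 32768 * 10 / 8 / 4 = 10240 bytes of LDS per work group.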
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

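// Both getFlatWorkGroupSizes above and getWavesPerEU below read "min,max"
// pairs from function attributes. For example, a kernel carrying
//   "amdgpu-flat-work-group-size"="128,256" "amdgpu-waves-per-eu"="2,8"
// requests work group sizes of 128-256 work items and 2-8 waves per
// execution unit; requests that conflict with each other or with the
// subtarget limits fall back to the defaults computed here.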
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

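// Illustrative example for the kernel-argument sizing below: for a kernel
// taking (i32, double), the i32 occupies bytes 0-3, the double is then
// aligned up to offset 8 and occupies bytes 8-15, so ExplicitArgBytes is 16
// and MaxAlign is 8 (assuming the usual ABI alignments of 4 for i32 and 8
// for double).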
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align::None();

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

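// The occupancy helpers below report how many waves per execution unit fit
// for a given register budget. The SGPR breakpoints presumably reflect the
// per-SIMD SGPR file size and its allocation granularity; e.g. on the
// VI-and-later path, a kernel using 90 SGPRs falls into the "<= 100" bucket
// and is limited to 8 waves.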
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

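// getMaxNumSGPRs/getMaxNumVGPRs below honor the "amdgpu-num-sgpr" and
// "amdgpu-num-vgpr" function attributes, but only when the requested count
// still leaves room for the reserved special registers, the preloaded
// user/system SGPRs, and the waves-per-EU constraints; otherwise the request
// is ignored and the wave-derived maximum is used instead.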
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {

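// Post-RA DAG mutation that keeps runs of memory operations of the same kind
// (VMEM, FLAT, SMRD or DS) adjacent by adding artificial edges, so the
// scheduler does not interleave unrelated work between them.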
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

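// Post-RA DAG mutation that tries to place independent SALU instructions in
// the latency shadow of long-latency MFMA instructions. Filling the shadow
// with scalar rather than vector work is intended to avoid the power bursts
// (and subsequent throttling) that back-to-back MFMA/VALU issue can cause;
// it can be disabled with -amdgpu-disable-power-sched.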
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}