//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
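  // For example (illustrative values), with a reqd_work_group_size of 256 in
  // the queried dimension, an ID query gets the range [0, 256) while a size
  // query gets [256, 257).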
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(getTotalNumVGPRs() / RoundedRegs, MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
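    // For example (illustrative), two back-to-back SMRD loads become SUa and
    // SU below and are linked by the addPredBarrier() call so the scheduler
    // keeps them together.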
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Returns the size
  // of the chain. Links up to MaxChain instructions.
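  // For example (illustrative), with MaxChain == 4 up to four reachable SALU
  // instructions receive an artificial edge from 'From', allowing them to be
  // scheduled into the MFMA's shadow.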
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}