//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
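  // If an explicit wavefront size was requested in the feature string, turn
  // off every wavefront-size feature that was not mentioned so only the
  // requested size stays enabled.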
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value range down.
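  // reqd_work_group_size carries one i32 operand per dimension (x, y, z);
  // when it covers the queried dimension, both MinSize and MaxSize are pinned
  // to that value below.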
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the maximum size itself is
  // passed as Hi; for a size query we need to pass the maximum size + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
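  // Round the total up to a multiple of 4 bytes; e.g. ExplicitOffset = 0 and
  // ExplicitArgBytes = 6 with no implicit arguments yields TotalSize = 6,
  // which is returned as 8.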
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
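  // Starting from To, walks SALU successors via a worklist and adds an
  // artificial edge from From to each visited node so they can be scheduled
  // into the MFMA shadow; VALU successors of From are ordered after the
  // linked SALU nodes.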
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}