//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads that don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}
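
// Illustrative example (not part of the implementation): for an amdhsa target
// the composed feature string handed to ParseSubtargetFeatures above begins
// with the defaults
//   "+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,
//    +flat-for-global,+unaligned-buffer-access,+trap-handler,
//    +enable-prt-strict-null,"
// and is followed by the user-provided FS, so an explicit "-xnack" or
// "-enable-prt-strict-null" in FS overrides the default because later entries
// in the feature string take precedence.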

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}
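
// Worked example (illustrative only, assuming 64 KiB of LDS, a wavefront size
// of 64 and a maximum flat work group size of 256): a kernel using 16 KiB of
// LDS gives NumGroups = 65536 / 16384 = 4 work groups per CU, each covering
// (256 + 63) / 64 = 4 waves, so the LDS-limited occupancy is
// min(4 * 4, getMaxWavesPerEU()) waves per EU, assuming getMaxWorkGroupsPerCU()
// does not impose a tighter limit.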

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
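
// Illustrative IR (assumed syntax; this file only reads the attributes): a
// kernel can constrain the two queries above with function attributes, e.g.
//
//   define amdgpu_kernel void @kernel() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256"
//                     "amdgpu-waves-per-eu"="2,8" }
//
// With these attributes getFlatWorkGroupSizes() returns {64, 256} and
// getWavesPerEU() returns {2, 8}, provided the values respect the subtarget
// limits checked above; otherwise the defaults are returned.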

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
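
// Illustrative result (assumed IR syntax): for a kernel whose
// reqd_work_group_size metadata is {64, 1, 1}, a call to
// @llvm.amdgcn.workitem.id.x() is annotated with !range !{i32 0, i32 64},
// i.e. the work item ID in dimension 0 is known to lie in [0, 64).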

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
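
// Worked example (illustrative only, assuming 64-bit global pointers): for a
// kernel taking (float addrspace(1)*, i32), the pointer occupies bytes 0-7 and
// the i32 bytes 8-11, so getExplicitKernArgSize() returns 12 with MaxAlign = 8;
// with no implicit arguments getKernArgSegmentSize() then returns
// alignTo(ExplicitOffset + 12, 4).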

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
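
// Illustrative IR (assumed syntax; these attributes are only read here): the
// register budgets above can be narrowed per function, e.g.
//
//   attributes #0 = { "amdgpu-num-sgpr"="48" "amdgpu-num-vgpr"="64" }
//
// Requested values that conflict with the reserved SGPRs, the preloaded input
// SGPRs, or the waves-per-EU bounds are ignored and the defaults are kept, as
// implemented in getMaxNumSGPRs() and getMaxNumVGPRs() above.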

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. That is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}