//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
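  // Illustrative example (not tied to any specific target): if FS contains
  // "+wavefrontsize32", the checks below append "-wavefrontsize16," and
  // "-wavefrontsize64," so the explicitly requested size cannot conflict with
  // a wavefront-size bit that is on by default in the processor definition.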
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

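  // Note on ordering (a general property of subtarget feature strings): the
  // user-provided FS is appended after the defaults assembled above, and later
  // entries override earlier ones, so anything explicitly enabled or disabled
  // in FS wins over the defaults in FullFS.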
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets that do not support it unless it was explicitly
  // requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads, which don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

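// Note (descriptive, derived from the code below): before GFX10 a VALU
// instruction may use at most one "constant bus" source (an SGPR or a
// literal). GFX10 raises the limit to two, except for the legacy 64-bit shift
// opcodes listed in the switch, which remain limited to one.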
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

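// Worked example for the LDS-based occupancy computation below (illustrative
// numbers only; assumes 64 KiB of LDS, wave64, a maximum flat workgroup size
// of 256, at least two workgroups per CU and a 10-wave EU limit):
//   NumGroups        = 65536 / 32768       = 2   (for Bytes == 32 KiB)
//   MaxGroupNumWaves = (256 + 64 - 1) / 64 = 4
//   MaxWaves         = min(2 * 4, 10)      = 8
// i.e. 32 KiB of LDS per workgroup limits occupancy to 8 waves per EU here.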
// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

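// For reference, the two attributes consumed above are plain string function
// attributes in the IR; an illustrative (hypothetical) kernel would carry:
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256"
//                     "amdgpu-waves-per-eu"="2,8" }
// Both are "min,max" integer pairs, which is what getIntegerPairAttribute
// parses.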
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

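// Illustrative effect of makeLIDRangeMetadata below: for a kernel carrying
// !reqd_work_group_size metadata of {256, 1, 1}, a call to
// @llvm.amdgcn.workitem.id.x() gets !range !{i32 0, i32 256} (ranges are
// half-open, so IDs span [0, 256)), while a local-size query in the same
// dimension gets !range !{i32 256, i32 257}.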
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

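// Worked example of the explicit-argument layout computed above (illustrative
// kernel signature, not from any real workload): for arguments
// (i32, double, i8), starting from offset 0:
//   i32    -> align 4, offset 0,  next offset 4
//   double -> align 8, offset 8,  next offset 16
//   i8     -> align 1, offset 16, next offset 17
// so ExplicitArgBytes == 17 and MaxAlign == 8. The segment size computed below
// additionally accounts for the target's explicit-argument base offset and any
// implicit arguments, rounded up to a 4-byte multiple.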
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

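// Worked example for the VGPR-based occupancy computation below (illustrative
// numbers; assumes a 256-register VGPR file, an allocation granule of 4 and a
// 10-wave EU limit): a function using 70 VGPRs is rounded up to
//   RoundedRegs = ((70 + 4 - 1) / 4) * 4 = 72
// and the result is min(max(256 / 72, 1), 10) = 3 waves per execution unit.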
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

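  // Descriptive note: canAddEdge conservatively decides whether Pred can be
  // made a predecessor of Succ without creating a cycle. It collects every
  // transitive successor of Succ and rejects the edge if Pred, or any of
  // Pred's transitive predecessors, already appears in that set (the new edge
  // would then close a cycle).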
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}