//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
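  // Scale the CU's local memory by the ratio of the maximum wave occupancy to
  // the requested wave count, split across the work groups that fit on the CU.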
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure once
  // register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
687 // 688 // FIXME: This really ends up using the requested number of SGPRs + number 689 // of reserved special registers in total. Theoretically you could re-use 690 // the last input registers for these special registers, but this would 691 // require a lot of complexity to deal with the weird aliasing. 692 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); 693 if (Requested && Requested < InputNumSGPRs) 694 Requested = InputNumSGPRs; 695 696 // Make sure requested value is compatible with values implied by 697 // default/requested minimum/maximum number of waves per execution unit. 698 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) 699 Requested = 0; 700 if (WavesPerEU.second && 701 Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) 702 Requested = 0; 703 704 if (Requested) 705 MaxNumSGPRs = Requested; 706 } 707 708 if (hasSGPRInitBug()) 709 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; 710 711 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), 712 MaxAddressableNumSGPRs); 713 } 714 715 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { 716 const Function &F = MF.getFunction(); 717 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 718 719 // Compute maximum number of VGPRs function can use using default/requested 720 // minimum number of waves per execution unit. 721 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 722 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); 723 724 // Check if maximum number of VGPRs was explicitly requested using 725 // "amdgpu-num-vgpr" attribute. 726 if (F.hasFnAttribute("amdgpu-num-vgpr")) { 727 unsigned Requested = AMDGPU::getIntegerAttribute( 728 F, "amdgpu-num-vgpr", MaxNumVGPRs); 729 730 // Make sure requested value is compatible with values implied by 731 // default/requested minimum/maximum number of waves per execution unit. 
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}