//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}
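
// Returns the maximum number of SGPR/constant-bus operands a single VALU
// instruction may read. Targets before GFX10 allow only one; GFX10 allows
// two, except for the 64-bit shift opcodes listed below.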
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}
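
// Return the effective minimum/maximum flat work group size for the given
// function. The "amdgpu-flat-work-group-size" attribute is honored when it
// specifies a valid range; otherwise the calling-convention default is used.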
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}
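
// Attach !range metadata to a workitem id or local size query so that later
// optimizations can assume the result is bounded by the kernel's work group
// size (narrowed further by reqd_work_group_size when present).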
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
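
// Compute the best-case occupancy (waves per EU) for this function given its
// LDS usage and, when non-zero, its SGPR and VGPR counts.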
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}