//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
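  // Worked example (illustrative only, not part of the upstream source): with
  // FS = "+wavefrontsize32" on an HSA target, FullFS just before FS is
  // appended below is roughly
  //   "+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,"
  //   "+flat-for-global,+unaligned-access-mode,+trap-handler,"
  //   "+enable-prt-strict-null,-wavefrontsize16,-wavefrontsize64,"
  // Because FS is appended last, the explicitly requested wavefront size is
  // the only wavefront-size feature left enabled.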
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support either 64-bit offsets for MUBUF instructions or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

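  // Worked example (illustrative only, not part of the upstream source):
  // with 64 KiB of LDS, Bytes = 16 KiB, MaxWorkGroupSize = 256 and
  // WaveSize = 64 (and assuming MaxWorkGroupsPerCu is at least 4), NumGroups
  // is 65536 / 16384 = 4, MaxGroupNumWaves is (256 + 63) / 64 = 4, so MaxWaves
  // is 4 * 4 = 16 before being clamped to getMaxWavesPerEU() (commonly 10).
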
  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ?
        Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is preferable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}
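
// Illustrative usage sketch (not part of the upstream source): passes usually
// reach this subtarget through the MachineFunction, e.g.
//
//   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
//   unsigned Occ = ST.getOccupancyWithLocalMemSize(MF);
//
// or, for code shared between GCN and R600, through AMDGPUSubtarget::get(MF).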