//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
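  // Illustrative example (not from the original comments): if FS contains
  // "+wavefrontsize32", the block below appends "-wavefrontsize16," and
  // "-wavefrontsize64," so only the explicitly requested size survives
  // feature parsing.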
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support either 64-bit offsets for MUBUF instructions or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }

  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
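  // A log2 value of 5 corresponds to the common wave64 configuration.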
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
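  // Worked example with illustrative numbers: 64 KiB of local memory and
  // 16 KiB of LDS per workgroup give NumGroups = 4 (assuming at least that
  // many workgroups per CU are allowed); a 256-item flat workgroup at wave64
  // gives MaxGroupNumWaves = 4, so MaxWaves = 16 before the clamp below.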
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
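  // For example (hypothetical attribute values): "amdgpu-waves-per-eu"="1,4"
  // falls back to the defaults if "amdgpu-flat-work-group-size" already
  // implies a minimum of 2 waves per EU.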
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ?
      Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect whether the function has flat scratch init is the
  // same as how MachineFunctionInfo derives it.
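  // (Illustrative) e.g. a pre-GFX10 HSA entry function with flat addressing
  // that carries the "amdgpu-calls" or "amdgpu-stack-objects" attribute is
  // treated as needing flat scratch init and therefore reserves the larger
  // SGPR count from getBaseReservedNumSGPRs().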
  bool FunctionHasFlatScratchInit = false;
  bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
  if (hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(F.getCallingConv()) &&
      (isAmdHsaOrMesa(F) || enableFlatScratch()) &&
      !flatScratchIsArchitected()) {
    if (HasCalls || HasStackObjects || enableFlatScratch())
      FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
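    // e.g. (hypothetical numbers): "amdgpu-num-sgpr"="104" is dropped when
    // the requested minimum waves per EU only allows 96 SGPRs per wave.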
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernel segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
  // Max number of system SGPRs
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
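    // e.g. (hypothetical numbers): a request for 160 VGPRs is dropped when
    // the waves-per-EU bounds only permit between 64 and 128 VGPRs.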
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
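  // The walk below is a depth-first search over SALU successors of To: each
  // visited SALU node gains an artificial predecessor edge from From (the
  // MFMA), and VALU successors of From are pushed behind the chain, so the
  // scalar instructions can be scheduled into the MFMA's shadow.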
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU ones to prevent power consumption bursts and
    // throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}