//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible, up to MaxChain
  // instructions. Return the number of newly linked instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}