//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

// Pull in the TableGen-generated subtarget descriptions. The GCN description
// is generated under the class name AMDGPUSubtarget, so it is temporarily
// renamed to GCNSubtarget while including it; the R600 description is then
// included under its own name. (The .inc files consume/undefine the
// GET_SUBTARGETINFO_* macros, which is why they are defined twice.)
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

/// Parse the feature string and derive R600-family properties from the
/// resulting generation. Returns *this so it can feed a member initializer.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

/// Build the effective feature string (defaults + user features), parse it,
/// and fix up interdependent/unspecified settings. Returns *this so the
/// result can initialize members (e.g. InstrInfo) in the constructor.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  // If the user requested any wavefront size explicitly, clear the other
  // wavefront-size features so only the requested one survives parsing.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

// Base-class constructor: establishes conservative defaults; the derived
// subtarget constructors overwrite these from the parsed feature string.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

// NOTE: the member initializer order below must match the declaration order
// in GCNSubtarget; in particular InstrInfo is initialized via
// initializeSubtargetDependencies() so the parsed features are available to
// TLInfo and the GlobalISel objects constructed afterwards.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

// Flat scratch is only used when explicitly requested via the command-line
// option and the subtarget actually has the flat scratch instructions.
bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

// Number of scalar (constant-bus) operands a single VALU instruction may
// read. Pre-GFX10 allows one; GFX10+ allows two, except for the 64-bit shift
// opcodes listed below, which remain limited to one.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// \returns the LDS budget (bytes) available to a work-group of \p F such
/// that \p NWaves waves can still run, scaling total LDS by the ratio of
/// maximum waves per EU to work-groups per CU. Returns 0 if no work-group
/// fits at all.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
//
// Compute the achievable occupancy (waves per EU) for a function that uses
// \p Bytes of LDS, clamped to [1, getMaxWavesPerEU()].
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

// Convenience overload: occupancy limited by the LDS usage recorded in the
// machine function info.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

// Default flat work-group size range by calling convention: graphics shader
// stages default to at most one wavefront per group; compute conventions get
// the full supported range.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

// Resolve the min/max flat work-group size for \p F from the
// "amdgpu-flat-work-group-size" attribute, falling back to the
// calling-convention default when absent or invalid.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

// Resolve the min/max waves-per-EU for \p F from the "amdgpu-waves-per-eu"
// attribute, validated against subtarget limits and against the constraints
// implied by the flat work-group size.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

// Read one dimension of the kernel's !reqd_work_group_size metadata;
// returns UINT_MAX when the metadata is absent or malformed.
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

// Maximum value a workitem id query can produce in the given dimension:
// the required work-group size minus one when pinned by metadata, otherwise
// derived from the flat work-group size upper bound.
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

/// If \p I is a workitem-id or local-size intrinsic call, attach !range
/// metadata bounding its result using the kernel's work-group size limits
/// (narrowed by !reqd_work_group_size when present). Returns true if
/// metadata was added.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

// Size in bytes of the implicit kernel argument area: fixed 16 bytes for
// Mesa kernels, otherwise taken from the "amdgpu-implicitarg-num-bytes"
// attribute (default 0).
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

/// Total size in bytes of the explicit kernel arguments of \p F, laying each
/// argument out at its ABI (or byref-specified) alignment. \p MaxAlign is
/// set to the largest argument alignment seen.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    // byref arguments are laid out using the pointee type and any explicit
    // parameter alignment.
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

/// Size of the kernarg segment for \p F: explicit args at their offset plus
/// any implicit argument area, rounded up to 4 bytes.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    // NOTE(review): this intentionally re-bases on ExplicitArgBytes (not
    // TotalSize); the implicit area is aligned relative to the explicit args.
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) +
                ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

// DWARF flavour is keyed off the wavefront size (wave32 vs wave64 register
// layout).
AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

// V_MAD_F16 is usable iff the pseudo maps to a real MC opcode on this
// subtarget.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

// Use GPR index mode when movrel is unavailable, or when the user opted in
// via -amdgpu-vgpr-index-mode and the subtarget supports it.
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

// Occupancy (waves per EU) as limited by SGPR usage; the thresholds differ
// by generation, and GFX10+ has no SGPR-based occupancy limit.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

// Occupancy as limited by VGPR usage: round the request up to the allocation
// granule and divide into the total VGPR budget.
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

// Number of SGPRs reserved for special uses (VCC, FLAT_SCRATCH, XNACK),
// depending on generation and whether flat scratch / XNACK are in use.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

// Combined occupancy estimate: the minimum of the LDS-, SGPR- and
// VGPR-imposed limits (register limits only applied when a count is given).
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

/// Maximum number of SGPRs the function may use, honoring the
/// "amdgpu-num-sgpr" attribute when present and valid, the waves-per-EU
/// constraints, reserved special registers, and the SGPR-init hardware bug.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

/// Maximum number of VGPRs the function may use, honoring the
/// "amdgpu-num-vgpr" attribute (doubled on GFX90A, which allocates VGPRs in
/// pairs) and the waves-per-EU constraints.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

// Fix up the latency of data dependencies that involve bundles, since the
// generic scheduler only sees the bundle header. When the def is a bundle,
// use the latency of the instruction inside it that actually writes the
// register (decremented for each trailing bundled instruction); when the use
// is a bundle, shorten the def's latency by the position of the first reader
// inside the bundle.
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
// Post-RA DAG mutation that adds artificial edges so independent SALU
// instructions can be scheduled into the shadow of long-latency MFMA
// instructions (see apply() below for the motivation).
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  // True for non-terminator SALU instructions.
  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Conservatively check that adding an edge Pred -> Succ would not create
  // a cycle: reject if any transitive predecessor of Pred is among the
  // transitive successors of Succ.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as much SALU intructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      // Skip non-MFMA instructions and the cheap accvgpr copies.
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

// Dispatch to the concrete subtarget (GCN vs R600) based on the target
// triple's architecture.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}