//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions or
  // support flat operations; otherwise they cannot access a 64-bit global
  // address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
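  // If the feature string left the wavefront size unset, assume 2^5 == 32 so
  // that later size queries still return something sensible.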
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ?
      Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
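  // Only request lane-mask tracking when the SI scheduler is not in use.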
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}