//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
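  // Illustrative example (not taken from this file): if FS were
  // "+wavefrontsize32", the block below would add
  // "-wavefrontsize16,-wavefrontsize64," before FS itself is appended, so the
  // explicit "+wavefrontsize32" that follows wins and the other sizes stay
  // disabled.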
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
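
  // Worked example with assumed, illustrative values (not specific to any
  // target): LocalMemorySize = 32768, Bytes = 8192, MaxWorkGroupSize = 256,
  // WaveSize = 64 and MaxWorkGroupsPerCu >= 4 give NumGroups = 4 and
  // MaxGroupNumWaves = 4, i.e. MaxWaves = 16 before the clamp to
  // getMaxWavesPerEU() above.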

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + the
    // number of reserved special registers in total. Theoretically you could
    // re-use the last input registers for these special registers, but this
    // would require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}