//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
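  // Illustrative example (not an exhaustive list of cases): with
  // FS = "+wavefrontsize32", the checks below append "-wavefrontsize16," and
  // "-wavefrontsize64," to FullFS, so only the explicitly requested wavefront
  // size remains enabled once FS itself is appended further down.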
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless flat-for-global is explicitly enabled or disabled (+/-) in FS, turn
  // on FlatForGlobal for targets that do not support ADDR64 variants of MUBUF
  // instructions. Such targets cannot use a 64-bit offset with a MUBUF
  // instruction to access the global address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless flat-for-global is explicitly enabled or disabled (+/-) in FS, use
  // MUBUF instructions for global address space access if flat operations are
  // not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
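  // Note: a WavefrontSizeLog2 of 5 corresponds to a wavefront size of 32.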
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
    F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
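  // For example (illustrative values): if the flat work group size attribute
  // already implies a minimum of 4 waves per EU, an explicit
  // "amdgpu-waves-per-eu"="2,4" request is rejected here in favor of the
  // default.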
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ?
        Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure once
  // register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
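  // Lane-mask tracking is therefore only enabled when the default machine
  // scheduler is in use.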
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
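    // As with the SGPR case above, a requested VGPR count (illustrative
    // example: "amdgpu-num-vgpr"="64") is dropped if it is incompatible with
    // the waves-per-EU bounds checked below.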
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
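  // The edges added are artificial (SDep::Artificial), so they only constrain
  // the scheduler's ordering and do not represent a real data dependence.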
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so they get a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}