//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
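  // If the user explicitly enabled a wavefront size, turn the other
  // wavefront-size features off so only the requested one remains set (e.g. a
  // "+wavefrontsize32" in FS implies "-wavefrontsize16,-wavefrontsize64,").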
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't currently support FP64 for EG/NI.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
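    // Note: InstrInfo's initializer runs initializeSubtargetDependencies()
    // above, so the feature bits are fully resolved before TLInfo and
    // FrameLowering are constructed from them.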
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // Raise the default minimum number of waves per execution unit to the value
  // implied by the default/requested maximum flat work group size.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
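  // (E.g. if "amdgpu-flat-work-group-size" already forces at least 4 waves
  // per EU, an "amdgpu-waves-per-eu" minimum of 2 is incompatible and the
  // request falls back to the default.)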
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ?
      Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
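  // Only enable lane mask tracking when the SI Machine Scheduler is not used.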
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
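  // Newly linked SALU nodes are also made predecessors of \p From's
  // independent VALU successors, so the SALU chain is scheduled into the MFMA
  // shadow ahead of them.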
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}