1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Implements the AMDGPU specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUSubtarget.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUCallLowering.h" 17 #include "AMDGPUInstructionSelector.h" 18 #include "AMDGPULegalizerInfo.h" 19 #include "AMDGPURegisterBankInfo.h" 20 #include "AMDGPUTargetMachine.h" 21 #include "SIMachineFunctionInfo.h" 22 #include "llvm/ADT/SmallString.h" 23 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" 24 #include "llvm/CodeGen/MachineScheduler.h" 25 #include "llvm/CodeGen/TargetFrameLowering.h" 26 #include "llvm/IR/IntrinsicsAMDGPU.h" 27 #include "llvm/IR/IntrinsicsR600.h" 28 #include "llvm/IR/MDBuilder.h" 29 #include "llvm/MC/MCSubtargetInfo.h" 30 #include <algorithm> 31 32 using namespace llvm; 33 34 #define DEBUG_TYPE "amdgpu-subtarget" 35 36 #define GET_SUBTARGETINFO_TARGET_DESC 37 #define GET_SUBTARGETINFO_CTOR 38 #define AMDGPUSubtarget GCNSubtarget 39 #include "AMDGPUGenSubtargetInfo.inc" 40 #define GET_SUBTARGETINFO_TARGET_DESC 41 #define GET_SUBTARGETINFO_CTOR 42 #undef AMDGPUSubtarget 43 #include "R600GenSubtargetInfo.inc" 44 45 static cl::opt<bool> DisablePowerSched( 46 "amdgpu-disable-power-sched", 47 cl::desc("Disable scheduling to minimize mAI power bursts"), 48 cl::init(false)); 49 50 static cl::opt<bool> EnableVGPRIndexMode( 51 "amdgpu-vgpr-index-mode", 52 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), 53 cl::init(false)); 54 55 static cl::opt<bool> EnableFlatScratch( 56 "amdgpu-enable-flat-scratch", 57 
cl::desc("Use flat scratch instructions"),
    cl::init(false));

// Allow alias analysis to be consulted during codegen (e.g. scheduling,
// load/store optimization). On by default; the flag exists to bisect
// AA-related miscompiles.
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

/// Post-process the R600 feature string and derive dependent feature bits.
/// Alloca promotion is forced on for all R600 targets; the user feature
/// string FS is appended afterwards so it can still override it.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // mul24 legality depends on the generation parsed above.
  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

/// Build the effective feature string for a GCN subtarget (defaults first,
/// then HSA-mandated features, then the user string FS so user settings win),
/// parse it, and fix up feature bits that depend on the selected generation.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits. If the user feature string selects any
  // wavefront size explicitly, force the sizes it did NOT mention off so the
  // defaults above cannot conflict with the user's choice.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  // The user feature string is appended last so it overrides the defaults.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads, but don't support
  // ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

// Base-class constructor: every feature flag starts at its most conservative
// value; the derived subtargets overwrite them from the parsed feature string
// (see initializeSubtargetDependencies above).
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

// GCN subtarget constructor. Note the initializer order: InstrInfo is
// initialized with the result of initializeSubtargetDependencies(), so all
// feature bits are finalized before TLInfo and the GlobalISel objects below
// observe *this.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
  AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
  AMDGPUSubtarget(TT),
  TargetTriple(TT),
  Gen(INVALID),
  InstrItins(getInstrItineraryForCPU(GPU)),
  LDSBankCount(0),
  MaxPrivateElementSize(0),

  FastFMAF32(false),
  FastDenormalF32(false),
  HalfRate64Ops(false),

  FlatForGlobal(false),
  AutoWaitcntBeforeBarrier(false),
  UnalignedScratchAccess(false),
  UnalignedAccessMode(false),

  HasApertureRegs(false),
  EnableXNACK(false),
  DoesNotSupportXNACK(false),
  EnableCuMode(false),
  TrapHandler(false),

  EnableLoadStoreOpt(false),
  EnableUnsafeDSOffsetFolding(false),
  EnableSIScheduler(false),
  EnableDS128(false),
  EnablePRTStrictNull(false),
  DumpCode(false),

  FP64(false),
  GCN3Encoding(false),
  CIInsts(false),
  GFX8Insts(false),
  GFX9Insts(false),
  GFX10Insts(false),
  GFX10_3Insts(false),
  GFX7GFX8GFX9Insts(false),
  SGPRInitBug(false),
  HasSMemRealTime(false),
  HasIntClamp(false),
  HasFmaMixInsts(false),
  HasMovrel(false),
  HasVGPRIndexMode(false),
  HasScalarStores(false),
  HasScalarAtomics(false),
  HasSDWAOmod(false),
  HasSDWAScalar(false),
  HasSDWASdst(false),
  HasSDWAMac(false),
  HasSDWAOutModsVOPC(false),
  HasDPP(false),
  HasDPP8(false),
  HasR128A16(false),
  HasGFX10A16(false),
  HasG16(false),
  HasNSAEncoding(false),
  GFX10_BEncoding(false),
  HasDLInsts(false),
  HasDot1Insts(false),
  HasDot2Insts(false),
  HasDot3Insts(false),
  HasDot4Insts(false),
  HasDot5Insts(false),
  HasDot6Insts(false),
  HasMAIInsts(false),
  HasPkFmacF16Inst(false),
  HasAtomicFaddInsts(false),
  EnableSRAMECC(false),
  DoesNotSupportSRAMECC(false),
  HasNoSdstCMPX(false),
  HasVscnt(false),
  HasGetWaveIdInst(false),
  HasSMemTimeInst(false),
  HasRegisterBanking(false),
  HasVOP3Literal(false),
  HasNoDataDepHazard(false),
  FlatAddressSpace(false),
  FlatInstOffsets(false),
  FlatGlobalInsts(false),
  FlatScratchInsts(false),
  ScalarFlatScratchInsts(false),
  AddNoCarryInsts(false),
  HasUnpackedD16VMem(false),
  LDSMisalignedBug(false),
  HasMFMAInlineLiteralBug(false),
  UnalignedBufferAccess(false),
  UnalignedDSAccess(false),

  ScalarizeGlobal(false),

  HasVcmpxPermlaneHazard(false),
  HasVMEMtoScalarWriteHazard(false),
  HasSMEMtoVectorWriteHazard(false),
  HasInstFwdPrefetchBug(false),
  HasVcmpxExecWARHazard(false),
  HasLdsBranchVmemWARHazard(false),
  HasNSAtoVMEMBug(false),
  HasOffset3fBug(false),
  HasFlatSegmentOffsetBug(false),
  HasImageStoreD16Bug(false),
  HasImageGather4D16Bug(false),

  FeatureDisable(false),
  InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
  TLInfo(TM, *this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

/// Flat scratch is used only when both the command-line flag requests it and
/// the hardware actually has flat scratch instructions.
bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

/// Number of scalar (constant-bus) operands a single VALU instruction may
/// read. Pre-GFX10 hardware allows one; GFX10 allows two, except for the
/// 64-bit shift opcodes listed below, which are still limited to one.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// Maximum LDS bytes a workgroup of function F can use while still fitting
/// NWaves waves per execution unit. NWaves == 1 can use all of LDS.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  // Use the maximum flat work group size as the workgroup size.
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  // Divide the per-CU LDS budget evenly across the requested wave count.
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
//
/// Occupancy (waves per EU) achievable when each workgroup of F uses Bytes
/// bytes of LDS. Returns 0 if F cannot be placed on a CU at all, and clamps
/// oversubscribed LDS usage to an occupancy of 1.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage. The "Bytes ? Bytes : 1u" guard
  // avoids dividing by zero when no LDS is used.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

/// Convenience overload: read the LDS size from the machine function info.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

/// Default {min, max} flat workgroup size for a calling convention. Graphics
/// shader stages default to a single wavefront; everything else may use the
/// subtarget maximum.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

/// {min, max} flat workgroup size for F: the "amdgpu-flat-work-group-size"
/// attribute if present and valid, otherwise the calling-convention default.
/// Any inconsistent or out-of-range request falls back to the default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// {min, max} waves per EU for F, honoring the "amdgpu-waves-per-eu"
/// attribute but rejecting requests that conflict with the subtarget limits
/// or with an explicitly requested flat workgroup size.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

/// Read the "reqd_work_group_size" metadata for dimension Dim (0..2).
/// Returns UINT_MAX when the metadata is absent or malformed.
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

/// A "Mesa kernel" is a non-shader function compiled for the Mesa 3D OS.
bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

/// Largest possible workitem ID in the given dimension: the required
/// workgroup size minus one if reqd_work_group_size is present, otherwise
/// the maximum flat workgroup size minus one.
unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

/// Attach !range metadata to a workitem-id / local-size intrinsic call,
/// bounding it by the kernel's (possibly attribute- or metadata-restricted)
/// workgroup size. Returns true if metadata was added.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Dim is 0..2 only when one of the known intrinsics matched above;
      // otherwise it is still UINT_MAX and this block is skipped.
      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

/// Byte size of the implicit (hidden) kernel arguments. Mesa kernels get a
/// fixed 16 bytes; otherwise the "amdgpu-implicitarg-num-bytes" attribute
/// decides, defaulting to 0.
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

/// Total byte size of the explicit kernel arguments of F, laid out with each
/// argument aligned to its ABI (or byref/param) alignment. MaxAlign is an
/// out-parameter receiving the largest alignment seen.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    // byref arguments are laid out by their pointee type/alignment.
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

/// Size of the whole kernarg segment: explicit args at their offset, plus
/// implicit args (if any) aligned to the implicit-arg pointer alignment.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

// R600 subtarget constructor. As with GCNSubtarget, TLInfo is initialized
// from the result of initializeSubtargetDependencies() so feature bits are
// final before the lowering object observes them.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

/// Tune the generic machine scheduler for GCN: track register pressure and
/// schedule in both directions.
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

/// True if V_MAD_F16 has a real MC opcode on this subtarget.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

/// Use GPR-indexing mode for vector indexing when movrel is unavailable, or
/// when both are available and the -amdgpu-vgpr-index-mode flag requests it.
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

/// Occupancy (waves per EU) achievable with the given SGPR count.
/// GFX10+ has no SGPR-induced occupancy limit; older generations use the
/// hardware allocation tables below.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// Occupancy (waves per EU) achievable with the given VGPR count, rounding
/// the request up to the VGPR allocation granule.
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

/// Number of SGPRs reserved for VCC / FLAT_SCRATCH / XNACK on this
/// generation, depending on whether flat scratch init and XNACK are in use.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

/// Combine the LDS-, SGPR-, and VGPR-induced occupancy limits into a single
/// achievable waves-per-EU figure. A zero register count means "no limit
/// from that register file".
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

/// Maximum number of SGPRs the function may allocate, honoring the
/// "amdgpu-num-sgpr" attribute, preloaded input SGPRs, the waves-per-EU
/// constraints, and the SGPR-init hardware bug.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 means "ignore the attribute" from here on.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

/// Maximum number of VGPRs the function may allocate, honoring the
/// "amdgpu-num-vgpr" attribute and the waves-per-EU constraints.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // Requested == 0 means "ignore the attribute".
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

/// Adjust the latency of a data dependency when either endpoint is a bundle:
/// attribute the latency to the instruction inside the bundle that actually
/// defines (resp. first reads) the dependent register.
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // Walk the bundled instructions after the BUNDLE header; the latency is
    // the defining instruction's latency minus the number of instructions
    // issued after it within the bundle.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // Symmetric case: discount the def latency by the position of the first
    // reader inside the use bundle.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
// Post-RA DAG mutation that links independent SALU instructions into the
// shadow of long-latency MFMA instructions (see apply() below).
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  // A schedulable SALU instruction (terminators excluded).
  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool
isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Returns true if adding an artificial edge Pred -> Succ would not create
  // a cycle, i.e. Succ is not already (transitively) a predecessor of Pred.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    // Collect the transitive successor set of Succ.
    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    // Walk Pred's transitive predecessors; if any of them is in Succ's
    // successor set, the new edge would close a cycle.
    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      // Artificial edge From -> SU forces SU into the MFMA's shadow.
      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      // Keep VALU successors of the MFMA after the linked SALU so they do
      // not slip into the shadow instead.
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      // Continue the chain through SALU successors of SU.
      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      // Skip non-MAI instructions and the cheap ACCVGPR moves.
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                 << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

/// Register the MFMA-shadow mutation with the post-RA scheduler.
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

/// Return the common AMDGPUSubtarget for MF, dispatching on the triple to
/// the GCN or R600 concrete subtarget.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

/// Same as above, but resolves the subtarget from a TargetMachine and a
/// specific function (per-function subtargets).
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}