//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

// Two subtarget-info .inc files are included back to back. While the first
// one (AMDGPUGenSubtargetInfo.inc) is processed, the identifier
// AMDGPUSubtarget is macro-renamed to GCNSubtarget; the rename is removed
// again before R600GenSubtargetInfo.inc is included. The GET_* macros are
// defined before each include. The order of these lines matters.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

// Parses the feature string for an R600 subtarget and fixes up the members
// that depend on the parsed generation.
//
// "+promote-alloca," is prepended to the user feature string \p FS so that it
// acts as a default that \p FS can still override. \p TT is not used in this
// function. Returns *this so the call can be used inside a constructor
// initializer list.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  // These two flags are derived from the parsed generation / ISA rather than
  // from the feature string.
  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

// Builds the effective feature string for a GCN subtarget, parses it, and
// then applies defaults and consistency fix-ups to the parsed members.
//
// The defaults are placed in front of the user string \p FS, so anything the
// user specifies comes later in the string and can override them. Returns
// *this so the call can be used inside a constructor initializer list.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  // If the user string turns on any wavefront size, explicitly turn off every
  // wavefront size that the user string does not mention at all.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  // The user string goes last so it wins over the defaults built above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // XNACK was requested by the default string above; undo it (both the
  // feature bit and the cached flag) if the device reports it as unsupported.
  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

// Initializes the members shared by the GCN and R600 subtargets to their
// pre-parse defaults. Note that HasMulI24, HasMulU24 and HasFminFmaxLegacy
// start out true while the other flags start out false; LocalMemorySize and
// WavefrontSize start at 0, which the GCN feature-initialization code in this
// file treats as "unspecified".
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

// Constructs a GCN subtarget for triple \p TT, device \p GPU and feature
// string \p FS.
//
// All feature flags are first set to false/0 in the initializer list; the
// real values are filled in by initializeSubtargetDependencies(), which is
// invoked as the initializer expression of InstrInfo near the end of the
// list. The GlobalISel helper objects are created in the constructor body.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    // Starting generation before feature parsing: SEA_ISLANDS for AMDHSA,
    // SOUTHERN_ISLANDS otherwise.
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    // Feature parsing happens here: initializeSubtargetDependencies() returns
    // *this, and InstrInfo is constructed from that reference.
    // NOTE(review): actual member initialization order follows the
    // declaration order in the header, not this list - confirm before
    // reordering anything.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  // The instruction selector needs the concrete AMDGPU register bank info, so
  // the pointer just stored in RegBankInfo is cast back to that type.
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

// Returns the constant bus limit for \p Opcode: 1 on generations before
// GFX10, 1 for the V_LSHL*/V_LSHR*/V_ASHR* 64-bit opcodes listed below, and 2
// for every other opcode on GFX10 and later.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

// Returns the amount of local memory available to function \p F when
// \p NWaves waves are wanted.
//
// For NWaves == 1 this is the full local memory size. Otherwise it is
// LocalMemorySize * MaxWavesPerEU / WorkGroupsPerCU / NWaves, where
// WorkGroupsPerCU is computed from the function's maximum flat work group
// size. Returns 0 when no work group fits on a CU.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// Returns the number of waves that function \p F can run with when it uses
// \p Bytes of local memory.
//
// The result is capped at getMaxWavesPerEU() and then raised to at least 1.
// Returns 0 only when getMaxWorkGroupsPerCU() reports 0 for the function's
// maximum flat work group size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // A request of 0 bytes is treated as 1 byte to avoid dividing by zero.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

// Convenience overload: takes the LDS size from the machine function's
// SIMachineFunctionInfo and forwards to the (Bytes, Function) overload.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

// Returns the default {minimum, maximum} flat work group size for calling
// convention \p CC:
//  - AMDGPU_CS, AMDGPU_KERNEL, SPIR_KERNEL:
//      {2 * wavefront size, max(4 * wavefront size, 256)}
//  - AMDGPU_VS/LS/HS/ES/GS/PS: {1, wavefront size}
//  - anything else:            {1, 16 * wavefront size}
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

// Returns the {minimum, maximum} flat work group size for \p F.
//
// The "amdgpu-flat-work-group-size" attribute is honored when present and
// valid; if the requested pair is inverted or falls outside the subtarget's
// [min, max] limits, the calling-convention default is returned instead.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

// Returns the {minimum, maximum} number of waves per execution unit for \p F.
//
// The "amdgpu-waves-per-eu" attribute is honored when present and valid. Any
// violation (inverted pair, values outside the subtarget limits, or a minimum
// below what an explicitly requested flat work group size implies) makes the
// function return the default pair instead.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  // NOTE(review): the trailing `true` presumably makes the second value of
  // the attribute optional (a zero maximum is tolerated below) - confirm
  // against AMDGPU::getIntegerPairAttribute.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum. A maximum
  // of zero is not compared against the minimum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

// Attaches !range metadata to \p I describing the possible values of a
// work-item id or local-size query.
//
// The upper bound defaults to the maximum flat work group size of the
// enclosing function and is narrowed to the matching reqd_work_group_size
// operand when that metadata is present with three operands. Returns false
// (and leaves \p I untouched) when the maximum size is 0, true otherwise.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Dim stays UINT_MAX unless one of the intrinsics below is recognized.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim is only ever 0, 1, 2 or UINT_MAX at this point, so this test is
      // effectively "an intrinsic was recognized above".
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

// Returns the number of bytes occupied by the explicit arguments of kernel
// \p F, laying each argument out at its ABI alignment in declaration order.
//
// \p MaxAlign is an output: it is reset to 1 and then raised to the largest
// ABI alignment among the arguments. Asserts that \p F uses the
// AMDGPU_KERNEL or SPIR_KERNEL calling convention.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    // Pad up to this argument's alignment, then append its allocation size.
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

// Returns the total kernel argument segment size for \p F, rounded up to a
// multiple of 4 bytes. \p MaxAlign is filled in by getExplicitKernArgSize().
//
// Without implicit arguments the size is ExplicitOffset + ExplicitArgBytes.
// With implicit arguments, the explicit bytes are padded to the implicit
// argument pointer alignment and the implicit bytes are appended.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    // NOTE(review): this recomputes TotalSize from ExplicitArgBytes alone, so
    // ExplicitOffset is not part of the result on this path - confirm that is
    // intended (it is harmless whenever ExplicitOffset is 0).
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

// Constructs an R600 subtarget for triple \p TT, device \p GPU and feature
// string \p FS.
//
// The feature flags start out false/0; initializeSubtargetDependencies() is
// invoked as part of the TLInfo initializer and fills in the parsed values.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

// Adjusts the machine scheduler policy for GCN. \p NumRegionInstrs is not
// used by this implementation.
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

// Returns true when the V_MAD_F16 pseudo opcode maps to a real MC opcode on
// this subtarget (pseudoToMCOpcode() yields -1 when it does not).
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

// Returns the occupancy (in waves) achievable with \p SGPRs scalar registers.
//
// GFX10 and later always return 10. VOLCANIC_ISLANDS and later (pre-GFX10)
// and the older generations each use their own threshold table below.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

// Returns the occupancy (in waves) achievable with \p VGPRs vector registers,
// using a fixed threshold table that does not depend on the generation.
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

// Returns how many SGPRs must be set aside for special registers in \p MF.
// The per-case comments below name the registers being accounted for.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

// Returns the maximum number of SGPRs that \p MF may use, excluding the
// reserved special registers.
//
// Starts from the limit implied by the minimum waves-per-EU, optionally
// replaces it with a valid "amdgpu-num-sgpr" request, forces the fixed count
// on subtargets with the SGPR init bug, then subtracts the reserved registers
// and caps the result at the addressable limit.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // In the checks below, setting Requested to 0 means "ignore the request".

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // NOTE(review): unsigned subtraction - this relies on MaxNumSGPRs being at
  // least getReservedNumSGPRs(MF); confirm that always holds.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

// Returns the maximum number of VGPRs that \p MF may use.
//
// Starts from the limit implied by the minimum waves-per-EU and replaces it
// with the "amdgpu-num-vgpr" request when that request is non-zero and
// consistent with the waves-per-EU range.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // Setting Requested to 0 means "ignore the request".
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
// DAG mutation that ties together adjacent memory operations of the same
// kind (VMEM, FLAT, SMRD or DS) so the scheduler keeps them next to each
// other.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    // SUa is the previous memory operation, or nullptr when the previous
    // instruction was not a memory operation.
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      // A non-memory instruction breaks the chain.
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      // First memory operation of a potential chain: just remember it.
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only link the pair when both instructions are of the same kind.
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        // Make SUa a barrier predecessor of SU.
        SU.addPredBarrier(SUa);

        // Every other predecessor of SU also becomes an artificial
        // predecessor of SUa.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Every other successor of SUa gets SU as an artificial predecessor.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      // The current unit becomes the candidate partner for the next one,
      // whether or not it was linked to the previous unit.
      SUa = &SU;
    }
  }
};
} // namespace

// Appends the memory-operation clustering mutation defined above to the list
// of post-RA scheduler mutations.
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

// Returns the subtarget of \p MF as the common AMDGPUSubtarget base, picking
// GCNSubtarget for the amdgcn architecture and R600Subtarget otherwise.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

// Same as above, but looks the subtarget up through the target machine \p TM
// for IR function \p F.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}