//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
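  // Note: entries appended later in FullFS (including the user-provided FS
  // below) override these defaults when the feature string is parsed.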
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads that don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ?
        SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
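  // When fewer waves than the maximum are requested, each workgroup may use a
  // proportionally larger share of the per-CU local memory.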
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
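  // The attribute holds a pair, e.g. "amdgpu-waves-per-eu"="2,4"; the maximum
  // may be omitted, in which case the default maximum is kept.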
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
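  // For example, with reqd_work_group_size = {64, 1, 1} a workitem-id.x query
  // gets the range [0, 64) while a local-size.x query gets [64, 65).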
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
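    // SUa tracks the previous memory SUnit; each adjacent pair of same-kind
    // memory operations is tied together with the artificial edges below.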
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}