//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
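  // On SI and later the branch below turns fp64/fp16 denormals on by default;
  // the fallback path instead forces fp32 denormals off.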
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
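  // (The trailing 'true' argument lets "amdgpu-waves-per-eu" specify only a
  // minimum value, leaving the maximum at its default.)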
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
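  // For example, a required work-group size of 256 in this dimension yields
  // the range [0, 256) for an ID query and [256, 257) for a size query.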
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
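  // Only enable lane-mask tracking when the default MachineScheduler is used.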
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
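    // When a pair of the same memory kind is found below, the second SUnit is
    // made dependent on the first and artificial edges are added so that no
    // other node can be scheduled between them.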
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}