//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
  AMDGPUGenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  TargetTriple(TT),
  Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
  InstrItins(getInstrItineraryForCPU(GPU)),
  LDSBankCount(0),
  MaxPrivateElementSize(0),

  FastFMAF32(false),
  HalfRate64Ops(false),

  FP64FP16Denormals(false),
  FlatForGlobal(false),
  AutoWaitcntBeforeBarrier(false),
  CodeObjectV3(false),
  UnalignedScratchAccess(false),
  UnalignedBufferAccess(false),

  HasApertureRegs(false),
  EnableXNACK(false),
  DoesNotSupportXNACK(false),
  EnableCuMode(false),
  TrapHandler(false),

  EnableLoadStoreOpt(false),
  EnableUnsafeDSOffsetFolding(false),
  EnableSIScheduler(false),
  EnableDS128(false),
  EnablePRTStrictNull(false),
  DumpCode(false),

  FP64(false),
  GCN3Encoding(false),
  CIInsts(false),
  GFX8Insts(false),
  GFX9Insts(false),
  GFX10Insts(false),
  GFX7GFX8GFX9Insts(false),
  SGPRInitBug(false),
  HasSMemRealTime(false),
  HasIntClamp(false),
  HasFmaMixInsts(false),
  HasMovrel(false),
  HasVGPRIndexMode(false),
  HasScalarStores(false),
  HasScalarAtomics(false),
  HasSDWAOmod(false),
  HasSDWAScalar(false),
  HasSDWASdst(false),
  HasSDWAMac(false),
  HasSDWAOutModsVOPC(false),
  HasDPP(false),
  HasR128A16(false),
  HasNSAEncoding(false),
  HasDLInsts(false),
  HasDot1Insts(false),
  HasDot2Insts(false),
  EnableSRAMECC(false),
  DoesNotSupportSRAMECC(false),
  HasNoSdstCMPX(false),
  HasVscnt(false),
  HasRegisterBanking(false),
  HasVOP3Literal(false),
  HasNoDataDepHazard(false),
  FlatAddressSpace(false),
  FlatInstOffsets(false),
  FlatGlobalInsts(false),
  FlatScratchInsts(false),
  ScalarFlatScratchInsts(false),
  AddNoCarryInsts(false),
  HasUnpackedD16VMem(false),
  LDSMisalignedBug(false),

  ScalarizeGlobal(false),

  HasVcmpxPermlaneHazard(false),
  HasVMEMtoScalarWriteHazard(false),
  HasSMEMtoVectorWriteHazard(false),
  HasInstFwdPrefetchBug(false),
  HasVcmpxExecWARHazard(false),
  HasLdsBranchVmemWARHazard(false),
  HasNSAtoVMEMBug(false),
  HasFlatSegmentOffsetBug(false),

  FeatureDisable(false),
  InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
  TLInfo(TM, *this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
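
// Illustrative arithmetic for getMaxLocalMemSizeWithWaveCount(), using assumed
// values rather than numbers queried from a real target: with
// getLocalMemorySize() = 65536, getMaxWavesPerEU() = 10, WorkGroupsPerCu = 8
// and NWaves = 4, the returned bound is 65536 * 10 / 8 / 4 = 20480 bytes of
// LDS per work-group.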

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
      F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Make sure the requested minimum does not exceed the requested maximum.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
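
// Illustration of how the flat work-group size is requested from IR; the
// kernel name and values below are invented for the example:
//
//   define amdgpu_kernel void @example_kernel() #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }
//
// For such a function, getFlatWorkGroupSizes() returns {64, 256}, provided the
// range is well-formed and lies within the subtarget's supported limits;
// otherwise the calling-convention default is used.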

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
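
// Illustration of the corresponding IR request for waves per execution unit;
// the values are invented for the example:
//
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
//
// For such a function, getWavesPerEU() returns {2, 4} as long as the pair is
// within the subtarget's limits and does not contradict the flat work-group
// size request; otherwise the default is returned.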

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                    Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
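
// Worked example for getExplicitKernArgSize(), with a made-up kernel
// signature and assuming 64-bit global pointers in the data layout: for a
// kernel taking (float addrspace(1)*, i32), the pointer contributes
// alignTo(0, 8) + 8 = 8 bytes and the i32 brings the total to
// alignTo(8, 4) + 4 = 12 bytes, with MaxAlign = 8. getKernArgSegmentSize()
// then adds the target's explicit-argument offset and any implicit argument
// bytes before rounding the result up to a multiple of 4.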

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
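
// Illustrative readings of the occupancy tables above, using assumed register
// counts: on a VOLCANIC_ISLANDS target, a function using 96 SGPRs maps to an
// occupancy of 8 waves per EU, and one using 100 VGPRs maps to 2 waves per EU;
// code that combines these limits would take the minimum of the two.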

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
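
// Illustration of the per-function register-budget request handled above; the
// attribute values are invented for the example:
//
//   attributes #0 = { "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" }
//
// A requested count is ignored if it conflicts with the reserved or preloaded
// registers, or with the waves-per-EU bounds computed above; otherwise it
// overrides the default maximum for that function.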

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}