//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly specified, turn
  // on FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
    TargetTriple(TT),
    Has16BitInsts(false),
    HasMadMixInsts(false),
    FP32Denormals(false),
    FPExceptions(false),
    HasSDWA(false),
    HasVOP3PInsts(false),
    HasMulI24(true),
    HasMulU24(true),
    HasInv2PiInlineImm(false),
    HasFminFmaxLegacy(true),
    EnablePromoteAlloca(false),
    HasTrigReducedRange(false),
    LocalMemorySize(0),
    WavefrontSize(0)
    { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
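// getOccupancyWithLocalMemSize below computes
// (LocalMemorySize * MaxWavesPerEU / WorkGroupsPerCu) / Bytes, clamped to
// [1, MaxWavesPerEU]. Illustrative example with assumed values (64 KB of LDS,
// MaxWavesPerEU = 10, WorkGroupsPerCu = 5): a kernel using 32 KB of LDS gets
// min(10, max(1, 65536 * 10 / 5 / 32768)) = 4 waves.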
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
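// Both "amdgpu-flat-work-group-size" (above) and "amdgpu-waves-per-eu" (below)
// are integer-pair function attributes read through
// AMDGPU::getIntegerPairAttribute, assumed here to take the form "min,max"
// (with the second value optional for "amdgpu-waves-per-eu"). If a flat work
// group size was explicitly requested, the minimum waves/EU it implies becomes
// the default lower bound, and any conflicting "amdgpu-waves-per-eu" request
// falls back to the defaults.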
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
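// Illustrative effect of makeLIDRangeMetadata below, assuming a kernel that
// carries !reqd_work_group_size metadata of {64, 1, 1}: a call to
// llvm.amdgcn.workitem.id.x is annotated with !range [0, 64), while a call to
// llvm.r600.read.local.size.x is annotated with !range [64, 65).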
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                    Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
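// Worked example for getExplicitKernArgSize below, assuming a kernel signature
// of (i32, double, i8): the i32 occupies bytes 0-3, the double is aligned up
// to offset 8 and occupies bytes 8-15, and the i8 occupies byte 16, so
// ExplicitArgBytes = 17 and MaxAlign = 8.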
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
    R600GenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    FMA(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    R600ALUInst(false),
    FP64(false),
    TexVTXClauseSize(0),
    Gen(R600),
    TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
    InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}