//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}