//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in FS, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
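  // Clamp to the subtarget's wave limit and guarantee at least one wave.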
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
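  // The trailing 'true' makes the second value optional, so the attribute may
  // specify just a minimum; requested values that fail the checks below fall
  // back to Default.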
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass max size
  // as Hi. For a size query we need to pass Hi + 1.
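  // For example, with a reqd_work_group_size of 64 in this dimension, an ID
  // query gets the range [0, 64) and a size query gets [64, 65).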
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure once
  // register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using just one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

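// Map an SGPR count to the maximum occupancy (waves per execution unit) it
// still permits; the thresholds differ between VI+ and earlier generations.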
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

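// Note that getMaxNumSGPRs below subtracts the registers reserved above from
// the per-wave SGPR budget before returning the limit.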
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
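    // Tracking only the previous memory SUnit (SUa) is therefore enough to
    // detect adjacent memory operations.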
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}