1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// Implements the AMDGPU specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPUSubtarget.h" 15 #include "AMDGPU.h" 16 #include "AMDGPUTargetMachine.h" 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPUInstructionSelector.h" 19 #include "AMDGPULegalizerInfo.h" 20 #include "AMDGPURegisterBankInfo.h" 21 #include "SIMachineFunctionInfo.h" 22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 23 #include "llvm/ADT/SmallString.h" 24 #include "llvm/CodeGen/MachineScheduler.h" 25 #include "llvm/MC/MCSubtargetInfo.h" 26 #include "llvm/IR/MDBuilder.h" 27 #include "llvm/CodeGen/TargetFrameLowering.h" 28 #include <algorithm> 29 30 using namespace llvm; 31 32 #define DEBUG_TYPE "amdgpu-subtarget" 33 34 #define GET_SUBTARGETINFO_TARGET_DESC 35 #define GET_SUBTARGETINFO_CTOR 36 #define AMDGPUSubtarget GCNSubtarget 37 #include "AMDGPUGenSubtargetInfo.inc" 38 #define GET_SUBTARGETINFO_TARGET_DESC 39 #define GET_SUBTARGETINFO_CTOR 40 #undef AMDGPUSubtarget 41 #include "R600GenSubtargetInfo.inc" 42 43 GCNSubtarget::~GCNSubtarget() = default; 44 45 R600Subtarget & 46 R600Subtarget::initializeSubtargetDependencies(const Triple &TT, 47 StringRef GPU, StringRef FS) { 48 SmallString<256> FullFS("+promote-alloca,"); 49 FullFS += FS; 50 ParseSubtargetFeatures(GPU, FullFS); 51 52 // FIXME: I don't think think Evergreen has any useful support for 53 // denormals, but should be checked. Should we issue a warning somewhere 54 // if someone tries to enable these? 55 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { 56 FP32Denormals = false; 57 } 58 59 HasMulU24 = getGeneration() >= EVERGREEN; 60 HasMulI24 = hasCaymanISA(); 61 62 return *this; 63 } 64 65 GCNSubtarget & 66 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, 67 StringRef GPU, StringRef FS) { 68 // Determine default and user-specified characteristics 69 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be 70 // enabled, but some instructions do not respect them and they run at the 71 // double precision rate, so don't enable by default. 72 // 73 // We want to be able to turn these off, but making this a subtarget feature 74 // for SI has the unhelpful behavior that it unsets everything else if you 75 // disable it. 76 // 77 // Similarly we want enable-prt-strict-null to be on by default and not to 78 // unset everything else if it is disabled 79 80 SmallString<256> FullFS("+promote-alloca,+load-store-opt,"); 81 82 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 83 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; 84 85 // FIXME: I don't think think Evergreen has any useful support for 86 // denormals, but should be checked. Should we issue a warning somewhere 87 // if someone tries to enable these? 88 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { 89 FullFS += "+fp64-fp16-denormals,"; 90 } else { 91 FullFS += "-fp32-denormals,"; 92 } 93 94 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS 95 96 FullFS += FS; 97 98 ParseSubtargetFeatures(GPU, FullFS); 99 100 // We don't support FP64 for EG/NI atm. 101 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); 102 103 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es 104 // on VI and newer hardware to avoid assertion failures due to missing ADDR64 105 // variants of MUBUF instructions. 106 if (!hasAddr64() && !FS.contains("flat-for-global")) { 107 FlatForGlobal = true; 108 } 109 110 // Set defaults if needed. 111 if (MaxPrivateElementSize == 0) 112 MaxPrivateElementSize = 4; 113 114 if (LDSBankCount == 0) 115 LDSBankCount = 32; 116 117 if (TT.getArch() == Triple::amdgcn) { 118 if (LocalMemorySize == 0) 119 LocalMemorySize = 32768; 120 121 // Do something sensible for unspecified target. 122 if (!HasMovrel && !HasVGPRIndexMode) 123 HasMovrel = true; 124 } 125 126 // Don't crash on invalid devices. 127 if (WavefrontSize == 0) 128 WavefrontSize = 64; 129 130 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; 131 132 return *this; 133 } 134 135 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : 136 TargetTriple(TT), 137 Has16BitInsts(false), 138 HasMadMixInsts(false), 139 FP32Denormals(false), 140 FPExceptions(false), 141 HasSDWA(false), 142 HasVOP3PInsts(false), 143 HasMulI24(true), 144 HasMulU24(true), 145 HasInv2PiInlineImm(false), 146 HasFminFmaxLegacy(true), 147 EnablePromoteAlloca(false), 148 HasTrigReducedRange(false), 149 LocalMemorySize(0), 150 WavefrontSize(0) 151 { } 152 153 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 154 const GCNTargetMachine &TM) : 155 AMDGPUGenSubtargetInfo(TT, GPU, FS), 156 AMDGPUSubtarget(TT), 157 TargetTriple(TT), 158 Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS), 159 InstrItins(getInstrItineraryForCPU(GPU)), 160 LDSBankCount(0), 161 MaxPrivateElementSize(0), 162 163 FastFMAF32(false), 164 HalfRate64Ops(false), 165 166 FP64FP16Denormals(false), 167 FlatForGlobal(false), 168 AutoWaitcntBeforeBarrier(false), 169 CodeObjectV3(false), 170 UnalignedScratchAccess(false), 171 UnalignedBufferAccess(false), 172 173 HasApertureRegs(false), 174 EnableXNACK(false), 175 TrapHandler(false), 176 177 EnableHugePrivateBuffer(false), 178 EnableLoadStoreOpt(false), 179 EnableUnsafeDSOffsetFolding(false), 180 EnableSIScheduler(false), 181 EnableDS128(false), 182 EnablePRTStrictNull(false), 183 DumpCode(false), 184 185 FP64(false), 186 GCN3Encoding(false), 187 CIInsts(false), 188 VIInsts(false), 189 GFX9Insts(false), 190 SGPRInitBug(false), 191 HasSMemRealTime(false), 192 HasIntClamp(false), 193 HasFmaMixInsts(false), 194 HasMovrel(false), 195 HasVGPRIndexMode(false), 196 HasScalarStores(false), 197 HasScalarAtomics(false), 198 HasSDWAOmod(false), 199 HasSDWAScalar(false), 200 HasSDWASdst(false), 201 HasSDWAMac(false), 202 HasSDWAOutModsVOPC(false), 203 HasDPP(false), 204 HasR128A16(false), 205 HasDLInsts(false), 206 HasDot1Insts(false), 207 HasDot2Insts(false), 208 EnableSRAMECC(false), 209 FlatAddressSpace(false), 210 FlatInstOffsets(false), 211 FlatGlobalInsts(false), 212 FlatScratchInsts(false), 213 AddNoCarryInsts(false), 214 HasUnpackedD16VMem(false), 215 216 ScalarizeGlobal(false), 217 218 FeatureDisable(false), 219 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), 220 TLInfo(TM, *this), 221 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { 222 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); 223 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); 224 RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); 225 InstSelector.reset(new AMDGPUInstructionSelector( 226 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); 227 } 228 229 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, 230 const Function &F) const { 231 if (NWaves == 1) 232 return getLocalMemorySize(); 233 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 234 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 235 unsigned MaxWaves = getMaxWavesPerEU(); 236 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; 237 } 238 239 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, 240 const Function &F) const { 241 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 242 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 243 unsigned MaxWaves = getMaxWavesPerEU(); 244 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; 245 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); 246 NumWaves = std::min(NumWaves, MaxWaves); 247 NumWaves = std::max(NumWaves, 1u); 248 return NumWaves; 249 } 250 251 unsigned 252 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { 253 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); 254 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); 255 } 256 257 std::pair<unsigned, unsigned> 258 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { 259 switch (CC) { 260 case CallingConv::AMDGPU_CS: 261 case CallingConv::AMDGPU_KERNEL: 262 case CallingConv::SPIR_KERNEL: 263 return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); 264 case CallingConv::AMDGPU_VS: 265 case CallingConv::AMDGPU_LS: 266 case CallingConv::AMDGPU_HS: 267 case CallingConv::AMDGPU_ES: 268 case CallingConv::AMDGPU_GS: 269 case CallingConv::AMDGPU_PS: 270 return std::make_pair(1, getWavefrontSize()); 271 default: 272 return std::make_pair(1, 16 * getWavefrontSize()); 273 } 274 } 275 276 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 277 const Function &F) const { 278 // FIXME: 1024 if function. 279 // Default minimum/maximum flat work group sizes. 280 std::pair<unsigned, unsigned> Default = 281 getDefaultFlatWorkGroupSize(F.getCallingConv()); 282 283 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 284 // starts using "amdgpu-flat-work-group-size" attribute. 285 Default.second = AMDGPU::getIntegerAttribute( 286 F, "amdgpu-max-work-group-size", Default.second); 287 Default.first = std::min(Default.first, Default.second); 288 289 // Requested minimum/maximum flat work group sizes. 290 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 291 F, "amdgpu-flat-work-group-size", Default); 292 293 // Make sure requested minimum is less than requested maximum. 294 if (Requested.first > Requested.second) 295 return Default; 296 297 // Make sure requested values do not violate subtarget's specifications. 298 if (Requested.first < getMinFlatWorkGroupSize()) 299 return Default; 300 if (Requested.second > getMaxFlatWorkGroupSize()) 301 return Default; 302 303 return Requested; 304 } 305 306 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 307 const Function &F) const { 308 // Default minimum/maximum number of waves per execution unit. 309 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); 310 311 // Default/requested minimum/maximum flat work group sizes. 312 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 313 314 // If minimum/maximum flat work group sizes were explicitly requested using 315 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 316 // number of waves per execution unit to values implied by requested 317 // minimum/maximum flat work group sizes. 318 unsigned MinImpliedByFlatWorkGroupSize = 319 getMaxWavesPerEU(FlatWorkGroupSizes.second); 320 bool RequestedFlatWorkGroupSize = false; 321 322 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 323 // starts using "amdgpu-flat-work-group-size" attribute. 324 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 325 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 326 Default.first = MinImpliedByFlatWorkGroupSize; 327 RequestedFlatWorkGroupSize = true; 328 } 329 330 // Requested minimum/maximum number of waves per execution unit. 331 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 332 F, "amdgpu-waves-per-eu", Default, true); 333 334 // Make sure requested minimum is less than requested maximum. 335 if (Requested.second && Requested.first > Requested.second) 336 return Default; 337 338 // Make sure requested values do not violate subtarget's specifications. 339 if (Requested.first < getMinWavesPerEU() || 340 Requested.first > getMaxWavesPerEU()) 341 return Default; 342 if (Requested.second > getMaxWavesPerEU()) 343 return Default; 344 345 // Make sure requested values are compatible with values implied by requested 346 // minimum/maximum flat work group sizes. 347 if (RequestedFlatWorkGroupSize && 348 Requested.first < MinImpliedByFlatWorkGroupSize) 349 return Default; 350 351 return Requested; 352 } 353 354 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { 355 Function *Kernel = I->getParent()->getParent(); 356 unsigned MinSize = 0; 357 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; 358 bool IdQuery = false; 359 360 // If reqd_work_group_size is present it narrows value down. 361 if (auto *CI = dyn_cast<CallInst>(I)) { 362 const Function *F = CI->getCalledFunction(); 363 if (F) { 364 unsigned Dim = UINT_MAX; 365 switch (F->getIntrinsicID()) { 366 case Intrinsic::amdgcn_workitem_id_x: 367 case Intrinsic::r600_read_tidig_x: 368 IdQuery = true; 369 LLVM_FALLTHROUGH; 370 case Intrinsic::r600_read_local_size_x: 371 Dim = 0; 372 break; 373 case Intrinsic::amdgcn_workitem_id_y: 374 case Intrinsic::r600_read_tidig_y: 375 IdQuery = true; 376 LLVM_FALLTHROUGH; 377 case Intrinsic::r600_read_local_size_y: 378 Dim = 1; 379 break; 380 case Intrinsic::amdgcn_workitem_id_z: 381 case Intrinsic::r600_read_tidig_z: 382 IdQuery = true; 383 LLVM_FALLTHROUGH; 384 case Intrinsic::r600_read_local_size_z: 385 Dim = 2; 386 break; 387 default: 388 break; 389 } 390 if (Dim <= 3) { 391 if (auto Node = Kernel->getMetadata("reqd_work_group_size")) 392 if (Node->getNumOperands() == 3) 393 MinSize = MaxSize = mdconst::extract<ConstantInt>( 394 Node->getOperand(Dim))->getZExtValue(); 395 } 396 } 397 } 398 399 if (!MaxSize) 400 return false; 401 402 // Range metadata is [Lo, Hi). For ID query we need to pass max size 403 // as Hi. For size query we need to pass Hi + 1. 404 if (IdQuery) 405 MinSize = 0; 406 else 407 ++MaxSize; 408 409 MDBuilder MDB(I->getContext()); 410 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize), 411 APInt(32, MaxSize)); 412 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); 413 return true; 414 } 415 416 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, 417 unsigned &MaxAlign) const { 418 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || 419 F.getCallingConv() == CallingConv::SPIR_KERNEL); 420 421 const DataLayout &DL = F.getParent()->getDataLayout(); 422 uint64_t ExplicitArgBytes = 0; 423 MaxAlign = 1; 424 425 for (const Argument &Arg : F.args()) { 426 Type *ArgTy = Arg.getType(); 427 428 unsigned Align = DL.getABITypeAlignment(ArgTy); 429 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); 430 ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; 431 MaxAlign = std::max(MaxAlign, Align); 432 } 433 434 return ExplicitArgBytes; 435 } 436 437 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, 438 unsigned &MaxAlign) const { 439 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); 440 441 unsigned ExplicitOffset = getExplicitKernelArgOffset(F); 442 443 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; 444 unsigned ImplicitBytes = getImplicitArgNumBytes(F); 445 if (ImplicitBytes != 0) { 446 unsigned Alignment = getAlignmentForImplicitArgPtr(); 447 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 448 } 449 450 // Being able to dereference past the end is useful for emitting scalar loads. 451 return alignTo(TotalSize, 4); 452 } 453 454 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 455 const TargetMachine &TM) : 456 R600GenSubtargetInfo(TT, GPU, FS), 457 AMDGPUSubtarget(TT), 458 InstrInfo(*this), 459 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 460 FMA(false), 461 CaymanISA(false), 462 CFALUBug(false), 463 HasVertexCache(false), 464 R600ALUInst(false), 465 FP64(false), 466 TexVTXClauseSize(0), 467 Gen(R600), 468 TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), 469 InstrItins(getInstrItineraryForCPU(GPU)) { } 470 471 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 472 unsigned NumRegionInstrs) const { 473 // Track register pressure so the scheduler can try to decrease 474 // pressure once register usage is above the threshold defined by 475 // SIRegisterInfo::getRegPressureSetLimit() 476 Policy.ShouldTrackPressure = true; 477 478 // Enabling both top down and bottom up scheduling seems to give us less 479 // register spills than just using one of these approaches on its own. 480 Policy.OnlyTopDown = false; 481 Policy.OnlyBottomUp = false; 482 483 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 484 if (!enableSIScheduler()) 485 Policy.ShouldTrackLaneMasks = true; 486 } 487 488 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 489 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { 490 if (SGPRs <= 80) 491 return 10; 492 if (SGPRs <= 88) 493 return 9; 494 if (SGPRs <= 100) 495 return 8; 496 return 7; 497 } 498 if (SGPRs <= 48) 499 return 10; 500 if (SGPRs <= 56) 501 return 9; 502 if (SGPRs <= 64) 503 return 8; 504 if (SGPRs <= 72) 505 return 7; 506 if (SGPRs <= 80) 507 return 6; 508 return 5; 509 } 510 511 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 512 if (VGPRs <= 24) 513 return 10; 514 if (VGPRs <= 28) 515 return 9; 516 if (VGPRs <= 32) 517 return 8; 518 if (VGPRs <= 36) 519 return 7; 520 if (VGPRs <= 40) 521 return 6; 522 if (VGPRs <= 48) 523 return 5; 524 if (VGPRs <= 64) 525 return 4; 526 if (VGPRs <= 84) 527 return 3; 528 if (VGPRs <= 128) 529 return 2; 530 return 1; 531 } 532 533 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { 534 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 535 if (MFI.hasFlatScratchInit()) { 536 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 537 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 538 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) 539 return 4; // FLAT_SCRATCH, VCC (in that order). 540 } 541 542 if (isXNACKEnabled()) 543 return 4; // XNACK, VCC (in that order). 544 return 2; // VCC. 545 } 546 547 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { 548 const Function &F = MF.getFunction(); 549 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 550 551 // Compute maximum number of SGPRs function can use using default/requested 552 // minimum number of waves per execution unit. 553 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 554 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); 555 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); 556 557 // Check if maximum number of SGPRs was explicitly requested using 558 // "amdgpu-num-sgpr" attribute. 559 if (F.hasFnAttribute("amdgpu-num-sgpr")) { 560 unsigned Requested = AMDGPU::getIntegerAttribute( 561 F, "amdgpu-num-sgpr", MaxNumSGPRs); 562 563 // Make sure requested value does not violate subtarget's specifications. 564 if (Requested && (Requested <= getReservedNumSGPRs(MF))) 565 Requested = 0; 566 567 // If more SGPRs are required to support the input user/system SGPRs, 568 // increase to accommodate them. 569 // 570 // FIXME: This really ends up using the requested number of SGPRs + number 571 // of reserved special registers in total. Theoretically you could re-use 572 // the last input registers for these special registers, but this would 573 // require a lot of complexity to deal with the weird aliasing. 574 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); 575 if (Requested && Requested < InputNumSGPRs) 576 Requested = InputNumSGPRs; 577 578 // Make sure requested value is compatible with values implied by 579 // default/requested minimum/maximum number of waves per execution unit. 580 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) 581 Requested = 0; 582 if (WavesPerEU.second && 583 Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) 584 Requested = 0; 585 586 if (Requested) 587 MaxNumSGPRs = Requested; 588 } 589 590 if (hasSGPRInitBug()) 591 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; 592 593 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), 594 MaxAddressableNumSGPRs); 595 } 596 597 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { 598 const Function &F = MF.getFunction(); 599 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 600 601 // Compute maximum number of VGPRs function can use using default/requested 602 // minimum number of waves per execution unit. 603 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 604 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); 605 606 // Check if maximum number of VGPRs was explicitly requested using 607 // "amdgpu-num-vgpr" attribute. 608 if (F.hasFnAttribute("amdgpu-num-vgpr")) { 609 unsigned Requested = AMDGPU::getIntegerAttribute( 610 F, "amdgpu-num-vgpr", MaxNumVGPRs); 611 612 // Make sure requested value is compatible with values implied by 613 // default/requested minimum/maximum number of waves per execution unit. 614 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) 615 Requested = 0; 616 if (WavesPerEU.second && 617 Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) 618 Requested = 0; 619 620 if (Requested) 621 MaxNumVGPRs = Requested; 622 } 623 624 return MaxNumVGPRs; 625 } 626 627 namespace { 628 struct MemOpClusterMutation : ScheduleDAGMutation { 629 const SIInstrInfo *TII; 630 631 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} 632 633 void apply(ScheduleDAGInstrs *DAG) override { 634 SUnit *SUa = nullptr; 635 // Search for two consequent memory operations and link them 636 // to prevent scheduler from moving them apart. 637 // In DAG pre-process SUnits are in the original order of 638 // the instructions before scheduling. 639 for (SUnit &SU : DAG->SUnits) { 640 MachineInstr &MI2 = *SU.getInstr(); 641 if (!MI2.mayLoad() && !MI2.mayStore()) { 642 SUa = nullptr; 643 continue; 644 } 645 if (!SUa) { 646 SUa = &SU; 647 continue; 648 } 649 650 MachineInstr &MI1 = *SUa->getInstr(); 651 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) || 652 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) || 653 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) || 654 (TII->isDS(MI1) && TII->isDS(MI2))) { 655 SU.addPredBarrier(SUa); 656 657 for (const SDep &SI : SU.Preds) { 658 if (SI.getSUnit() != SUa) 659 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial)); 660 } 661 662 if (&SU != &DAG->ExitSU) { 663 for (const SDep &SI : SUa->Succs) { 664 if (SI.getSUnit() != &SU) 665 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial)); 666 } 667 } 668 } 669 670 SUa = &SU; 671 } 672 } 673 }; 674 } // namespace 675 676 void GCNSubtarget::getPostRAMutations( 677 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { 678 Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); 679 } 680 681 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { 682 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) 683 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>()); 684 else 685 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>()); 686 } 687 688 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) { 689 if (TM.getTargetTriple().getArch() == Triple::amdgcn) 690 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F)); 691 else 692 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F)); 693 } 694