//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
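    // Note: InstrInfo's initializer runs initializeSubtargetDependencies(),
    // so the feature string has already been parsed by the time TLInfo and
    // FrameLowering below are constructed.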
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
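  // The switch below maps a work-item ID or local-size intrinsic to its
  // dimension so the matching reqd_work_group_size operand can be used.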
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
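  // Round the segment up to a multiple of 4 bytes (one dword) so the tail can
  // still be fetched with dword-granular scalar loads.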
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}