//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

// Pull the TableGen'd subtarget descriptions in twice: once with
// AMDGPUSubtarget aliased to GCNSubtarget (so the generated ctor/tables bind
// to the GCN class), and once, with the alias removed, for R600.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

// Apply the R600 default feature string, parse user features on top of it,
// then derive dependent feature flags from the resulting generation.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

// Build the default GCN feature string (defaults first so that FS, appended
// last, can override them), parse it, and then patch up any values the
// feature string left unset or inconsistent.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

// Base-class constructor: zero/false defaults for the features shared by the
// GCN and R600 subtargets; real values are filled in by the derived classes'
// initializeSubtargetDependencies.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

// NOTE: the member-initializer list below must keep the feature flags ahead
// of InstrInfo — InstrInfo's initializer runs
// initializeSubtargetDependencies(), which overwrites these defaults, and
// TLInfo/FrameLowering in turn depend on the fully-initialized subtarget.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),

    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

// Return the per-workgroup LDS budget (in bytes) that still allows NWaves
// waves to be resident, given F's maximum flat workgroup size.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// Inverse of the above: given an LDS usage in bytes, return the achievable
// wave occupancy, clamped to [1, getMaxWavesPerEU()].
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // Guard against division by zero when no LDS is used.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

// Convenience overload: read the LDS size from the function's machine info.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

// Default (min, max) flat workgroup size, which varies by calling convention:
// compute kernels get a larger default range than graphics shaders.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

// Resolve the (min, max) flat workgroup size for F from its attributes,
// falling back to the calling-convention default whenever the requested
// values are inconsistent or out of the subtarget's supported range.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

// Resolve the (min, max) waves-per-EU for F from its attributes, validated
// against both the subtarget limits and the bounds implied by the flat
// workgroup sizes; any violation falls back to the default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

// Attach !range metadata to a workitem-id / local-size intrinsic call,
// bounding it by the kernel's flat workgroup size (narrowed further by
// reqd_work_group_size metadata when present). Returns true if metadata
// was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Dim stays UINT_MAX for calls that are not one of the handled
      // intrinsics, so the reqd_work_group_size narrowing below is skipped.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

// Total size in bytes of F's explicit kernel arguments, laid out with each
// argument's ABI alignment; also reports the largest alignment seen via
// MaxAlign (out-parameter).
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

// Size of the whole kernarg segment: explicit args at their offset, plus any
// target implicit arguments, rounded up to a dword.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    // NOTE(review): this recomputes TotalSize from ExplicitArgBytes without
    // ExplicitOffset — confirm the implicit-arg pointer is meant to be
    // aligned relative to the explicit args only.
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

// R600 constructor. TLInfo is deliberately initialized with
// initializeSubtargetDependencies() so the feature flags above are final
// before lowering info is built.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

// Occupancy (waves/EU) achievable for a given SGPR count; the breakpoints
// differ for VI+ vs. earlier generations.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

// Occupancy (waves/EU) achievable for a given VGPR count.
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

// Number of SGPRs set aside for special registers (VCC, FLAT_SCRATCH, XNACK)
// that user code may not allocate.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

// Maximum number of SGPRs the function may use, honoring the
// "amdgpu-num-sgpr" attribute (when valid), preloaded input SGPRs, the
// waves-per-EU constraints, and the SGPR-init hardware bug.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

// Maximum number of VGPRs the function may use, honoring the
// "amdgpu-num-vgpr" attribute (when valid) and waves-per-EU constraints.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
// DAG mutation that adds artificial edges between adjacent memory operations
// of the same kind so the scheduler keeps them together.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only cluster operations of the same memory class (VMEM with VMEM,
      // FLAT with FLAT, etc.).
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

// Dispatch to the concrete subtarget (GCN for amdgcn triples, R600
// otherwise) and return it as the common base class.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}