//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
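  // The block below defaults FP64/FP16 denormals to on for SI and newer
  // targets; FP32 denormals are left off unless explicitly enabled through FS
  // (see the comment at the top of this function).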
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDotInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass MaxSize as Hi.
  // For a size query we need to pass MaxSize + 1 as Hi.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
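  // Note: rounding up to a dword multiple presumably lets the last argument be
  // fetched with a full dword-sized scalar load without straying outside the
  // kernarg segment, since SMEM loads operate at dword granularity.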
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are still in the original order of
    // the instructions before scheduling.
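    // Sketch of the loop below: SUa remembers the previous memory SUnit seen.
    // When the current SUnit is a memory op of the same class (VMEM, FLAT,
    // SMRD or DS), a barrier edge ties the pair together, and artificial edges
    // are added so that no other SUnit can be scheduled between them.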
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}