//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OSes on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
                                 const FeatureBitset &FeatureBits) :
  TargetTriple(TT),
  SubtargetFeatureBits(FeatureBits),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
  AMDGPUGenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT, getFeatureBits()),
  TargetTriple(TT),
  Gen(SOUTHERN_ISLANDS),
  IsaVersion(ISAVersion0_0_0),
  LDSBankCount(0),
  MaxPrivateElementSize(0),

  FastFMAF32(false),
  HalfRate64Ops(false),

  FP64FP16Denormals(false),
  DX10Clamp(false),
  FlatForGlobal(false),
  AutoWaitcntBeforeBarrier(false),
  CodeObjectV3(false),
  UnalignedScratchAccess(false),
  UnalignedBufferAccess(false),

  HasApertureRegs(false),
  EnableXNACK(false),
  TrapHandler(false),
  DebuggerInsertNops(false),
  DebuggerEmitPrologue(false),

  EnableHugePrivateBuffer(false),
  EnableVGPRSpilling(false),
  EnableLoadStoreOpt(false),
  EnableUnsafeDSOffsetFolding(false),
  EnableSIScheduler(false),
  EnableDS128(false),
  DumpCode(false),

  FP64(false),
  GCN3Encoding(false),
  CIInsts(false),
  VIInsts(false),
  GFX9Insts(false),
  SGPRInitBug(false),
  HasSMemRealTime(false),
  HasIntClamp(false),
  HasFmaMixInsts(false),
  HasMovrel(false),
  HasVGPRIndexMode(false),
  HasScalarStores(false),
  HasScalarAtomics(false),
  HasSDWAOmod(false),
  HasSDWAScalar(false),
  HasSDWASdst(false),
  HasSDWAMac(false),
  HasSDWAOutModsVOPC(false),
  HasDPP(false),
  HasDLInsts(false),
  D16PreservesUnusedBits(false),
  FlatAddressSpace(false),
  FlatInstOffsets(false),
  FlatGlobalInsts(false),
  FlatScratchInsts(false),
  AddNoCarryInsts(false),
  HasUnpackedD16VMem(false),

  ScalarizeGlobal(false),

  FeatureDisable(false),
  InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
  TLInfo(TM, *this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  AS = AMDGPU::getAMDGPUAS(TT);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: The default maximum should be 1024 if this is a function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
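  // (Both the "amdgpu-flat-work-group-size" and "amdgpu-waves-per-eu"
  // attributes are parsed by AMDGPU::getIntegerPairAttribute as a
  // comma-separated "min,max" pair, e.g. "128,256".)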
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim < 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
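  // A worked example (assuming a kernel with reqd_work_group_size = {64,1,1},
  // so MinSize == MaxSize == 64 at this point): a workitem id query gets the
  // range [0, 64), while a local size query gets [64, 65), i.e. the constant
  // value 64.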
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT, getFeatureBits()),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)),
  AS(AMDGPU::getAMDGPUAS(TT)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check whether the maximum number of SGPRs was explicitly requested using
  // the "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + the
    // number of reserved special registers in total. Theoretically you could
    // re-use the last input registers for these special registers, but this
    // would require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
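    // For example (assuming the function requests a minimum of N and a
    // maximum of M waves per EU): a request larger than the SGPR budget at
    // N waves, or smaller than the minimum SGPR count implied by M waves,
    // is ignored and the default is kept.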
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check whether the maximum number of VGPRs was explicitly requested using
  // the "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart. During DAG pre-processing, the
    // SUnits are still in the original program order of the instructions
    // before scheduling.
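    // In effect, for back-to-back memory operations A and B of the same kind
    // (e.g. two DS instructions), this adds a barrier edge from A to B plus
    // artificial edges so that no other instruction can be scheduled between
    // the two.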
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}