//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

// Pull in the TableGen-generated subtarget descriptions. The AMDGPU (GCN)
// tables are generated under the class name AMDGPUSubtarget, so temporarily
// rename it to GCNSubtarget for that include, then undo the rename before
// including the R600 tables.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
// NOTE(review): these two macros are already defined above; the identical
// redefinition is legal C++ but redundant.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

/// Apply R600-family defaults ahead of the user feature string, parse the
/// combined features, then derive generation-dependent flags.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  // Defaults come first so the user-supplied FS (appended below) can
  // override them.
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

/// Apply GCN defaults ahead of the user feature string, parse the combined
/// features, and resolve feature flags that depend on the OS, generation,
/// or other features.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  // User features are appended last so they win over the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // The legacy min/max instructions are only available pre-VI.
  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

/// Common-base constructor: records the triple/feature bits and zeroes the
/// feature flags shared by the R600 and GCN subtargets. The real values are
/// filled in later by the derived classes' feature parsing.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
                                 const FeatureBitset &FeatureBits) :
  TargetTriple(TT),
  SubtargetFeatureBits(FeatureBits),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

/// GCN subtarget constructor. Note the ordering trick in the initializer
/// list: InstrInfo is initialized from initializeSubtargetDependencies(),
/// which parses the feature string and must therefore run before TLInfo and
/// anything else that inspects the parsed features.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT, getFeatureBits()),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  AS = AMDGPU::getAMDGPUAS(TT);
  // GlobalISel support objects; InstSelector needs the RegBankInfo created
  // just above it.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

/// Return the maximum number of bytes of LDS a workgroup may use while still
/// sustaining an occupancy of \p NWaves waves, given \p F's flat workgroup
/// size bounds. With a single wave the entire LDS is available.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

/// Inverse of getMaxLocalMemSizeWithWaveCount: given \p Bytes of LDS used,
/// return the achievable occupancy in waves, clamped to [1, MaxWavesPerEU].
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // Guard against division by zero when no LDS is used.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

/// Convenience overload reading the LDS usage recorded in the machine
/// function info.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

/// Default [min, max] flat workgroup size for a calling convention:
/// compute kernels get a multi-wave default, graphics shaders a single
/// wavefront, and everything else a wide permissive range.
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

/// Resolve the effective [min, max] flat workgroup size for \p F from its
/// attributes, falling back to the calling-convention default when the
/// requested values are absent or violate subtarget limits.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// Resolve the effective [min, max] waves per execution unit for \p F,
/// honoring the "amdgpu-waves-per-eu" attribute and the lower bound implied
/// by the flat workgroup size; invalid requests fall back to the default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // A requested maximum of 0 means "unspecified" and is not checked.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

/// Attach !range metadata to a workitem-id or local-size intrinsic call,
/// bounding it by the kernel's flat workgroup size (narrowed further by
/// reqd_work_group_size when present). Returns true if metadata was set.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Dim stays UINT_MAX for unrelated intrinsics; 0/1/2 select the
      // x/y/z operand of reqd_work_group_size below.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

/// Compute the total size in bytes of the explicit kernel arguments of \p F,
/// with each argument placed at its ABI alignment. \p MaxAlign is set to the
/// largest alignment seen.
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

/// Size of the whole kernarg segment: explicit args plus any implicit
/// arguments appended after them, rounded up to a multiple of 4.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    // NOTE(review): the implicit-arg total is based on ExplicitArgBytes
    // (without ExplicitOffset), overwriting TotalSize from above — this
    // mirrors the long-standing behavior; confirm before changing.
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

/// R600 subtarget constructor. As in the GCN ctor, TLInfo is initialized
/// from initializeSubtargetDependencies() so feature parsing happens before
/// lowering info is built.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT, getFeatureBits()),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)),
  AS (AMDGPU::getAMDGPUAS(TT)) { }

/// Tune the generic machine scheduler for GCN: track register pressure and
/// schedule in both directions to reduce spilling.
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

/// VGPR spilling is always allowed for non-shader (compute) functions, or
/// when explicitly enabled by the subtarget feature.
bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

/// Occupancy (waves per EU) achievable for a given number of SGPRs used.
/// The thresholds encode how the SGPR file is partitioned between waves;
/// VI+ has different granularity than earlier generations.
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// Occupancy (waves per EU) achievable for a given number of VGPRs used.
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

/// Number of SGPRs reserved for special registers (VCC, FLAT_SCRATCH, XNACK)
/// that the allocator must not hand out.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

/// Maximum number of SGPRs available to \p MF after accounting for the
/// waves-per-EU request, the "amdgpu-num-sgpr" attribute, reserved special
/// registers, and the SGPR-init hardware bug.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

/// Maximum number of VGPRs available to \p MF after accounting for the
/// waves-per-EU request and the "amdgpu-num-vgpr" attribute.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
/// Post-RA DAG mutation that adds artificial edges between consecutive
/// memory operations of the same kind (VMEM/FLAT/SMRD/DS) so the scheduler
/// keeps them adjacent (clustered).
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        // Non-memory instruction breaks the chain.
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        // Also order every other predecessor of SU after SUa so nothing can
        // be scheduled between the pair.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

/// Register the memory-clustering mutation with the post-RA scheduler.
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

/// Retrieve the common AMDGPUSubtarget for \p MF, dispatching on the arch
/// to the correct concrete subtarget class.
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

/// As above, but resolved from a TargetMachine and Function (used before a
/// MachineFunction exists).
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}