//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // that should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // that should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
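  // Note that anything the user passes in FS is appended after these defaults
  // below, and later entries in a feature string override earlier ones, so an
  // explicit +/- setting from the user still wins.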
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ?
        SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves /
         WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
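  // Both attributes are encoded as "min,max" integer pairs. For example, a
  // kernel declared in clang with
  // __attribute__((amdgpu_flat_work_group_size(128, 256))) carries
  // "amdgpu-flat-work-group-size"="128,256" in the IR (the values are purely
  // illustrative). For "amdgpu-waves-per-eu" the maximum may be omitted.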
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
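  // For example, with a required work group size of {64, 1, 1}, a workitem id
  // query on dimension x gets !range [0, 64) while a local size query gets
  // !range [64, 65) (the values here are purely illustrative).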
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure once
  // register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
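  // Lane-mask tracking is therefore only turned on when the SI scheduler is
  // not in use.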
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In the DAG pre-process, SUnits are in the original order of
    // the instructions before scheduling.
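    // Each time two such operations are found back to back, artificial edges
    // are added so that nothing else can be scheduled between them.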
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}