//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
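
  // Note: the features appended above only establish defaults. The user
  // feature string FS is appended last (below), and later "+"/"-" settings
  // for the same feature take precedence when the combined string is parsed,
  // so an explicit setting in FS still wins.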

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ?
        SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
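  // Illustrative example (hypothetical values): with 65536 bytes of LDS,
  // MaxWaves = 10, WorkGroupsPerCu = 5 and NWaves = 8, the budget below is
  // 65536 * 10 / 5 / 8 = 16384 bytes.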
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
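  // Illustrative example: with MaxSize == 256, an ID query gets the range
  // [0, 256) (IDs run from 0 to 255), while a size query gets
  // [MinSize, 257) so that a size of exactly 256 is still in range.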
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
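    // (Resetting Requested to 0 means "ignore the attribute and keep the
    // default computed above".)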
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart. In the DAG pre-process, SUnits
    // are in the original order of the instructions before scheduling.
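    // Only operations of the same kind (VMEM, FLAT, SMRD, or DS) are linked;
    // any non-memory instruction in between resets the search.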
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}