//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // that should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // that should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
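  // EG/NI here are the Evergreen and Northern Islands R600 generations; the
  // assert below only checks that the fp64 feature never ends up enabled on
  // anything older than Southern Islands.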
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is specified explicitly, turn
  // on FlatForGlobal for all OS-es on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
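
// Illustrative note for the helper below (values assumed for this example,
// not taken from any particular target): with the 32768 byte LDS default,
// 10 waves per EU, 4 work groups per CU and NWaves == 2, the formula yields
// 32768 * 10 / 4 / 2 = 40960 bytes of LDS per work group while still
// sustaining the requested wave count.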
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
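  // Both pair-valued attributes used here take a simple "min,max" string,
  // e.g. "amdgpu-flat-work-group-size"="128,256" or
  // "amdgpu-waves-per-eu"="2,4"; for the latter the maximum may be omitted.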
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
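  // For example, with a reqd_work_group_size of 256 in the queried dimension,
  // a work item id query gets the range [0, 256) while a local size query
  // gets [256, 257).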
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
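  // Lane mask tracking is therefore only turned on when the default GCN
  // scheduler is in use.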
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
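    // Resetting Requested to 0 drops the attribute and keeps the
    // wave-count-derived default in MaxNumSGPRs.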
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In the DAG pre-process, SUnits are in the original order of
    // the instructions before scheduling.
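    // For example, two back-to-back DS operations end up linked with
    // artificial edges so the scheduler keeps the pair together.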
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}