//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDotInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
      F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's specifications.
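  // Note that an out-of-range request is not clamped; the entire requested
  // pair is discarded and the calling-convention default is used instead.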
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
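  // The metadata is expected to carry exactly three i32 operands (x, y, z);
  // when the queried dimension is known, the range collapses to that single
  // value. For example (hypothetical kernel), !reqd_work_group_size
  // !{i32 64, i32 1, i32 1} gives a workitem.id.x range of [0, 64).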
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                    Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler, so only
  // track lane masks when the default scheduler is in use.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
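  // A request that is too small to cover the reserved SGPRs is ignored; a
  // request below the number of preloaded input SGPRs is raised to cover
  // them; and anything outside the wave-occupancy bounds checked below is
  // also ignored.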
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
        F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
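    // Only pairs of the same kind (VMEM, FLAT, SMRD or DS) are clustered.
    // The artificial edges added below also reroute the pair's other
    // predecessors and successors around it, so no unrelated node can be
    // scheduled between the two memory operations.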
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}