//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,"
              "+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;
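
  // Illustrative sketch (not exhaustive) of the feature string handed to
  // ParseSubtargetFeatures below, for an HSA SI+ target:
  //   "+promote-alloca,+dx10-clamp,+load-store-opt,+flat-address-space,...,
  //    +fp64-fp16-denormals,<user FS>"
  // User-provided features come last, so they override these defaults.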
  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global was explicitly specified, turn on FlatForGlobal
  // for all OSes on VI and newer hardware to avoid assertion failures due to
  // missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
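
// Illustrative example of the LDS budgeting below (assumed numbers): with
// 65536 bytes of local memory, 10 waves per EU, and a flat work group size
// that allows 4 work groups per CU, requesting NWaves == 5 leaves
//   65536 * 10 / 4 / 5 = 32768
// bytes of LDS for each work group.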

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
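
// For reference, the attribute consumed above takes a "min,max" pair in IR,
// e.g. (illustrative):
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// Inverted or out-of-range requests fall back to the defaults.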

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;
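
  // Illustrative IR: a kernel carrying
  //   !reqd_work_group_size !{i32 256, i32 1, i32 1}
  // pins MinSize == MaxSize == 256 for the Dim == 0 queries above.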

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}
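
// The stepwise tables below map a register budget to achievable waves per EU.
// Illustrative reading of the VGPR table: a kernel using 33-36 VGPRs runs at
// most 7 waves per EU; trimming it to 32 VGPRs raises that to 8.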

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;
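
    // Illustrative: with attributes #0 = { "amdgpu-num-sgpr"="48" } and 16
    // preloaded user/system SGPRs, Requested stays 48 here; a request below
    // 16 would have been raised to 16 by the check above.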

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
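    // Sketch of the edges added below for an adjacent pair (A, B) of
    // same-kind memory ops: B gets a barrier edge on A, B's other
    // predecessors become artificial predecessors of A, and A's other
    // successors become artificial successors of B, so nothing can be
    // scheduled between the pair.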
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}