//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#endif
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

/// Build the full feature string (defaults + OS-implied features + the
/// user-provided \p FS), parse it, then apply post-parse fixups that depend
/// on the resulting generation. Called from the constructor; returns *this
/// so it can be chained into member initialization.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // User features take precedence: they come last so they can override the
  // defaults appended above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  // Note: the check is against the raw user string FS, not FullFS, so only an
  // explicit user request suppresses this fixup.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {

/// Concrete GISelAccessor that owns the GlobalISel pipeline objects
/// (call lowering, instruction selector, legalizer, register bank info)
/// for the SI subtarget. Only built when GlobalISel is compiled in.
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<RegisterBankInfo> RegBankInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
  const InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }
  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }
  const RegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }
};

} // end anonymous namespace
#endif

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SI+); everything else is the older R600 family.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  // Feature parsing may overwrite the defaults set above.
  initializeSubtargetDependencies(TT, GPU, FS);
}

/// Return the amount of LDS (local memory) a workgroup of function \p F can
/// use while still allowing \p NWaves waves to run. For a single wave the
/// whole LDS is available; otherwise divide by the number of co-resident
/// workgroups and the requested wave count.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  // Use the maximum flat workgroup size for \p F to derive how many
  // workgroups share one compute unit.
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

/// Return the occupancy (waves per EU, clamped to [1, MaxWavesPerEU]) that
/// is achievable when a workgroup of \p F uses \p Bytes of LDS.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // Guard against Bytes == 0 to avoid division by zero.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

/// Return the (minimum, maximum) flat workgroup size for \p F, honoring the
/// "amdgpu-flat-work-group-size" (and legacy "amdgpu-max-work-group-size")
/// function attributes. Falls back to subtarget defaults when the requested
/// values are inconsistent or out of the hardware's range.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// Return the (minimum, maximum) number of waves per execution unit for
/// \p F, honoring the "amdgpu-waves-per-eu" attribute and cross-checking it
/// against the flat-workgroup-size request. Invalid requests fall back to
/// the default range.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // A requested maximum of 0 means "unspecified" and is skipped here.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

/// Attach !range metadata to \p I when it is a workitem-id or local-size
/// intrinsic call, narrowing the known value range using the kernel's flat
/// workgroup sizes and (if present) its reqd_work_group_size metadata.
/// Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Dim stays UINT_MAX for unrelated intrinsics; set to 0/1/2 for x/y/z.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
  : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TLInfo(TM, *this) {
  // Install a (possibly stub) GlobalISel accessor; ownership is transferred
  // to the subtarget via setGISelAccessor.
#ifndef LLVM_BUILD_GLOBAL_ISEL
  GISelAccessor *GISel = new GISelAccessor();
#else
  SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
  GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  GISel->Legalizer.reset(new AMDGPULegalizerInfo());

  GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  GISel->InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
#endif
  setGISelAccessor(*GISel);
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

/// VGPR spilling is always allowed for non-shader (compute) functions; for
/// shaders it must be explicitly enabled by the subtarget feature.
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

/// Return the total kernarg segment size: the explicit argument bytes plus,
/// if the ABI defines implicit arguments, padding to the implicit-arg
/// pointer alignment followed by the implicit bytes.
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

/// Map an SGPR count to the occupancy (waves per EU) it allows.
/// VI+ has a different SGPR budget than SI/CI, hence the two tables.
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// Map a VGPR count to the occupancy (waves per EU) it allows.
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

/// Return how many SGPRs are reserved for special uses (FLAT_SCRATCH,
/// XNACK, VCC) for \p MF on this subtarget generation.
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

/// Compute the maximum number of SGPRs \p MF may use, honoring the
/// "amdgpu-num-sgpr" attribute, the waves-per-EU request, preloaded input
/// SGPRs, the SGPR-init hardware bug, and the reserved special registers.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 below means "ignore the attribute".
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

/// Compute the maximum number of VGPRs \p MF may use, honoring the
/// "amdgpu-num-vgpr" attribute and the waves-per-EU request, minus any
/// reserved VGPRs.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 below means "ignore the attribute".
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}