//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}
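
// Note on feature resolution (an explanatory aside, not normative): the user
// feature string FS is appended after the built-in defaults above, and
// subtarget features are applied in order with the last entry winning, so
// e.g. an explicit "-load-store-opt" in FS overrides the "+load-store-opt"
// default. The constructor below zero-initializes every feature flag; the
// real values are filled in by initializeSubtargetDependencies.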

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                          const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                       const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
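
// A worked example for the defaults computed below (illustrative only): with
// the 64-wide wavefront configured above, a compute kernel defaults to a flat
// work group size range of (128, 256), while a graphics shader defaults to
// (1, 64). Out-of-range attribute requests fall back to the defaults rather
// than being clamped into range.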

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
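
// Example of the range metadata emitted below (a sketch, assuming a kernel
// carrying reqd_work_group_size = <64, 1, 1>): a workitem.id.x call gets
// !range !{i32 0, i32 64}, since IDs lie in [0, size), while a local.size.x
// query gets !range !{i32 64, i32 65}, since the metadata interval is
// half-open and the size itself must be representable.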

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
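
// The four unique_ptr members set up in the constructor below are the
// GlobalISel pipeline components for GCN targets: call lowering, the
// legalization ruleset, register bank mapping, and instruction selection.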

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
  : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo());

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
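
// Example of the clamping performed below (illustrative only): if a function
// requests "amdgpu-num-sgpr"="32" but its preloaded user/system SGPRs already
// occupy 40 registers, the request is raised to 40; if a request then exceeds
// what the minimum waves-per-EU occupancy permits, it is discarded entirely
// and the default maximum is kept instead.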

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}