//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

/// Build the effective feature string (defaults + OS-implied features + the
/// user-supplied \p FS), parse it, and then apply a few fixups that cannot be
/// expressed as plain subtarget features. Called from the constructor; returns
/// *this so it can be chained into member initialization if needed.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults are listed first so that anything in FS (appended below) takes
  // precedence over them.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  // Note: the user's original FS (not FullFS) is checked, so either an explicit
  // enable or an explicit disable of flat-for-global suppresses this override.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

/// Construct the subtarget. All feature flags start false/zero here and are
/// filled in by ParseSubtargetFeatures via initializeSubtargetDependencies.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN hardware (SI at minimum); everything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAClampVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

/// Return the LDS budget (in bytes) available to a workgroup of function \p F
/// if \p NWaves waves are to stay resident. With a single wave the whole LDS
/// is available; otherwise the total is scaled by how many waves/workgroups
/// share a compute unit.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  // Uses the maximum flat workgroup size for F (the pessimistic case).
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

/// Inverse of getMaxLocalMemSizeWithWaveCount: given an LDS usage of \p Bytes,
/// return the achievable occupancy (waves), clamped to [1, MaxWavesPerEU].
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // Guard against division by zero when Bytes == 0.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

/// Return the {minimum, maximum} flat workgroup size for \p F, taking the
/// "amdgpu-flat-work-group-size" (and legacy "amdgpu-max-work-group-size")
/// attributes into account. Falls back to calling-convention-based defaults
/// whenever the requested values are inconsistent or violate the subtarget's
/// limits.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the pair ordered if the attribute lowered the maximum below the
  // default minimum.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// Return the {minimum, maximum} number of waves per execution unit for \p F,
/// honoring the "amdgpu-waves-per-eu" attribute and the occupancy implied by
/// the flat workgroup size. Invalid or inconsistent requests fall back to the
/// computed defaults.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (Requested.second == 0 means "no maximum was specified".)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

/// If \p I is a workitem-id or local-size intrinsic call, attach !range
/// metadata bounding its result by the kernel's workgroup size (narrowed by
/// reqd_work_group_size when present). Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        // FALLTHROUGH: ID queries share the dimension of the size query.
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        // FALLTHROUGH
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        // FALLTHROUGH
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim was assigned (0..2) only by the recognized cases above; UINT_MAX
      // means the call was not one of the handled intrinsics.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

/// Tune the generic machine scheduler for SI: track register pressure and
/// schedule in both directions.
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

// VGPR spilling is always permitted for non-shader (compute) calling
// conventions; shaders require the explicit enable flag.
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

/// Return the total kernarg segment size: the explicit arguments plus any
/// implicit arguments, with the explicit portion aligned so the implicit
/// argument pointer is properly aligned.
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

/// Map an SGPR count to the achievable occupancy (waves per EU). The
/// thresholds differ between VI+ and earlier generations.
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// Map a VGPR count to the achievable occupancy (waves per EU).
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

/// Return how many SGPRs are reserved for special uses (VCC, XNACK,
/// FLAT_SCRATCH) and thus unavailable for allocation in \p MF.
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

/// Compute the maximum number of SGPRs \p MF may use, honoring the
/// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, preloaded input
/// SGPRs, and the SGPR-init hardware bug workaround.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 throughout means "ignore the request".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

/// Compute the maximum number of VGPRs \p MF may use, honoring the
/// "amdgpu-num-vgpr" attribute and the waves-per-EU constraints.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 throughout means "ignore the request".)
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}