1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUSubtarget.h" 16 #include "SIMachineFunctionInfo.h" 17 #include "llvm/ADT/SmallString.h" 18 #include "llvm/CodeGen/MachineScheduler.h" 19 #include "llvm/Target/TargetFrameLowering.h" 20 #include <algorithm> 21 22 using namespace llvm; 23 24 #define DEBUG_TYPE "amdgpu-subtarget" 25 26 #define GET_SUBTARGETINFO_ENUM 27 #define GET_SUBTARGETINFO_TARGET_DESC 28 #define GET_SUBTARGETINFO_CTOR 29 #include "AMDGPUGenSubtargetInfo.inc" 30 31 AMDGPUSubtarget::~AMDGPUSubtarget() = default; 32 33 AMDGPUSubtarget & 34 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, 35 StringRef GPU, StringRef FS) { 36 // Determine default and user-specified characteristics 37 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be 38 // enabled, but some instructions do not respect them and they run at the 39 // double precision rate, so don't enable by default. 40 // 41 // We want to be able to turn these off, but making this a subtarget feature 42 // for SI has the unhelpful behavior that it unsets everything else if you 43 // disable it. 44 45 SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"); 46 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 47 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; 48 49 FullFS += FS; 50 51 ParseSubtargetFeatures(GPU, FullFS); 52 53 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es 54 // on VI and newer hardware to avoid assertion failures due to missing ADDR64 55 // variants of MUBUF instructions. 56 if (!hasAddr64() && !FS.contains("flat-for-global")) { 57 FlatForGlobal = true; 58 } 59 60 // FIXME: I don't think think Evergreen has any useful support for 61 // denormals, but should be checked. Should we issue a warning somewhere 62 // if someone tries to enable these? 63 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { 64 FP64FP16Denormals = false; 65 FP32Denormals = false; 66 } 67 68 // Set defaults if needed. 69 if (MaxPrivateElementSize == 0) 70 MaxPrivateElementSize = 4; 71 72 return *this; 73 } 74 75 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 76 const TargetMachine &TM) 77 : AMDGPUGenSubtargetInfo(TT, GPU, FS), 78 TargetTriple(TT), 79 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600), 80 IsaVersion(ISAVersion0_0_0), 81 WavefrontSize(64), 82 LocalMemorySize(0), 83 LDSBankCount(0), 84 MaxPrivateElementSize(0), 85 86 FastFMAF32(false), 87 HalfRate64Ops(false), 88 89 FP32Denormals(false), 90 FP64FP16Denormals(false), 91 FPExceptions(false), 92 DX10Clamp(false), 93 FlatForGlobal(false), 94 UnalignedScratchAccess(false), 95 UnalignedBufferAccess(false), 96 97 HasApertureRegs(false), 98 EnableXNACK(false), 99 TrapHandler(false), 100 DebuggerInsertNops(false), 101 DebuggerReserveRegs(false), 102 DebuggerEmitPrologue(false), 103 104 EnableVGPRSpilling(false), 105 EnablePromoteAlloca(false), 106 EnableLoadStoreOpt(false), 107 EnableUnsafeDSOffsetFolding(false), 108 EnableSIScheduler(false), 109 DumpCode(false), 110 111 FP64(false), 112 IsGCN(false), 113 GCN1Encoding(false), 114 GCN3Encoding(false), 115 CIInsts(false), 116 GFX9Insts(false), 117 SGPRInitBug(false), 118 HasSMemRealTime(false), 119 Has16BitInsts(false), 120 HasVOP3PInsts(false), 121 HasMovrel(false), 122 HasVGPRIndexMode(false), 123 HasScalarStores(false), 124 HasInv2PiInlineImm(false), 125 HasSDWA(false), 126 HasDPP(false), 127 FlatAddressSpace(false), 128 129 R600ALUInst(false), 130 CaymanISA(false), 131 CFALUBug(false), 132 HasVertexCache(false), 133 TexVTXClauseSize(0), 134 ScalarizeGlobal(false), 135 136 FeatureDisable(false), 137 InstrItins(getInstrItineraryForCPU(GPU)) { 138 AS = AMDGPU::getAMDGPUAS(TT); 139 initializeSubtargetDependencies(TT, GPU, FS); 140 } 141 142 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, 143 const Function &F) const { 144 if (NWaves == 1) 145 return getLocalMemorySize(); 146 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 147 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 148 unsigned MaxWaves = getMaxWavesPerEU(); 149 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; 150 } 151 152 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, 153 const Function &F) const { 154 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 155 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 156 unsigned MaxWaves = getMaxWavesPerEU(); 157 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; 158 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); 159 NumWaves = std::min(NumWaves, MaxWaves); 160 NumWaves = std::max(NumWaves, 1u); 161 return NumWaves; 162 } 163 164 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 165 const Function &F) const { 166 // Default minimum/maximum flat work group sizes. 167 std::pair<unsigned, unsigned> Default = 168 AMDGPU::isCompute(F.getCallingConv()) ? 169 std::pair<unsigned, unsigned>(getWavefrontSize() * 2, 170 getWavefrontSize() * 4) : 171 std::pair<unsigned, unsigned>(1, getWavefrontSize()); 172 173 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 174 // starts using "amdgpu-flat-work-group-size" attribute. 175 Default.second = AMDGPU::getIntegerAttribute( 176 F, "amdgpu-max-work-group-size", Default.second); 177 Default.first = std::min(Default.first, Default.second); 178 179 // Requested minimum/maximum flat work group sizes. 180 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 181 F, "amdgpu-flat-work-group-size", Default); 182 183 // Make sure requested minimum is less than requested maximum. 184 if (Requested.first > Requested.second) 185 return Default; 186 187 // Make sure requested values do not violate subtarget's specifications. 188 if (Requested.first < getMinFlatWorkGroupSize()) 189 return Default; 190 if (Requested.second > getMaxFlatWorkGroupSize()) 191 return Default; 192 193 return Requested; 194 } 195 196 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 197 const Function &F) const { 198 // Default minimum/maximum number of waves per execution unit. 199 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); 200 201 // Default/requested minimum/maximum flat work group sizes. 202 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 203 204 // If minimum/maximum flat work group sizes were explicitly requested using 205 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 206 // number of waves per execution unit to values implied by requested 207 // minimum/maximum flat work group sizes. 208 unsigned MinImpliedByFlatWorkGroupSize = 209 getMaxWavesPerEU(FlatWorkGroupSizes.second); 210 bool RequestedFlatWorkGroupSize = false; 211 212 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 213 // starts using "amdgpu-flat-work-group-size" attribute. 214 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 215 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 216 Default.first = MinImpliedByFlatWorkGroupSize; 217 RequestedFlatWorkGroupSize = true; 218 } 219 220 // Requested minimum/maximum number of waves per execution unit. 221 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 222 F, "amdgpu-waves-per-eu", Default, true); 223 224 // Make sure requested minimum is less than requested maximum. 225 if (Requested.second && Requested.first > Requested.second) 226 return Default; 227 228 // Make sure requested values do not violate subtarget's specifications. 229 if (Requested.first < getMinWavesPerEU() || 230 Requested.first > getMaxWavesPerEU()) 231 return Default; 232 if (Requested.second > getMaxWavesPerEU()) 233 return Default; 234 235 // Make sure requested values are compatible with values implied by requested 236 // minimum/maximum flat work group sizes. 237 if (RequestedFlatWorkGroupSize && 238 Requested.first > MinImpliedByFlatWorkGroupSize) 239 return Default; 240 241 return Requested; 242 } 243 244 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 245 const TargetMachine &TM) : 246 AMDGPUSubtarget(TT, GPU, FS, TM), 247 InstrInfo(*this), 248 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 249 TLInfo(TM, *this) {} 250 251 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, 252 const TargetMachine &TM) : 253 AMDGPUSubtarget(TT, GPU, FS, TM), 254 InstrInfo(*this), 255 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 256 TLInfo(TM, *this) {} 257 258 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 259 unsigned NumRegionInstrs) const { 260 // Track register pressure so the scheduler can try to decrease 261 // pressure once register usage is above the threshold defined by 262 // SIRegisterInfo::getRegPressureSetLimit() 263 Policy.ShouldTrackPressure = true; 264 265 // Enabling both top down and bottom up scheduling seems to give us less 266 // register spills than just using one of these approaches on its own. 267 Policy.OnlyTopDown = false; 268 Policy.OnlyBottomUp = false; 269 270 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 271 if (!enableSIScheduler()) 272 Policy.ShouldTrackLaneMasks = true; 273 } 274 275 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { 276 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 277 } 278 279 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, 280 unsigned ExplicitArgBytes) const { 281 unsigned ImplicitBytes = getImplicitArgNumBytes(MF); 282 if (ImplicitBytes == 0) 283 return ExplicitArgBytes; 284 285 unsigned Alignment = getAlignmentForImplicitArgPtr(); 286 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 287 } 288 289 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 290 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 291 if (SGPRs <= 80) 292 return 10; 293 if (SGPRs <= 88) 294 return 9; 295 if (SGPRs <= 100) 296 return 8; 297 return 7; 298 } 299 if (SGPRs <= 48) 300 return 10; 301 if (SGPRs <= 56) 302 return 9; 303 if (SGPRs <= 64) 304 return 8; 305 if (SGPRs <= 72) 306 return 7; 307 if (SGPRs <= 80) 308 return 6; 309 return 5; 310 } 311 312 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 313 if (VGPRs <= 24) 314 return 10; 315 if (VGPRs <= 28) 316 return 9; 317 if (VGPRs <= 32) 318 return 8; 319 if (VGPRs <= 36) 320 return 7; 321 if (VGPRs <= 40) 322 return 6; 323 if (VGPRs <= 48) 324 return 5; 325 if (VGPRs <= 64) 326 return 4; 327 if (VGPRs <= 84) 328 return 3; 329 if (VGPRs <= 128) 330 return 2; 331 return 1; 332 } 333 334 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { 335 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 336 if (MFI.hasFlatScratchInit()) { 337 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 338 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 339 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) 340 return 4; // FLAT_SCRATCH, VCC (in that order). 341 } 342 343 if (isXNACKEnabled()) 344 return 4; // XNACK, VCC (in that order). 345 return 2; // VCC. 346 } 347 348 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { 349 const Function &F = *MF.getFunction(); 350 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 351 352 // Compute maximum number of SGPRs function can use using default/requested 353 // minimum number of waves per execution unit. 354 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 355 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); 356 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); 357 358 // Check if maximum number of SGPRs was explicitly requested using 359 // "amdgpu-num-sgpr" attribute. 360 if (F.hasFnAttribute("amdgpu-num-sgpr")) { 361 unsigned Requested = AMDGPU::getIntegerAttribute( 362 F, "amdgpu-num-sgpr", MaxNumSGPRs); 363 364 // Make sure requested value does not violate subtarget's specifications. 365 if (Requested && (Requested <= getReservedNumSGPRs(MF))) 366 Requested = 0; 367 368 // If more SGPRs are required to support the input user/system SGPRs, 369 // increase to accommodate them. 370 // 371 // FIXME: This really ends up using the requested number of SGPRs + number 372 // of reserved special registers in total. Theoretically you could re-use 373 // the last input registers for these special registers, but this would 374 // require a lot of complexity to deal with the weird aliasing. 375 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); 376 if (Requested && Requested < InputNumSGPRs) 377 Requested = InputNumSGPRs; 378 379 // Make sure requested value is compatible with values implied by 380 // default/requested minimum/maximum number of waves per execution unit. 381 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) 382 Requested = 0; 383 if (WavesPerEU.second && 384 Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) 385 Requested = 0; 386 387 if (Requested) 388 MaxNumSGPRs = Requested; 389 } 390 391 if (hasSGPRInitBug()) 392 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; 393 394 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), 395 MaxAddressableNumSGPRs); 396 } 397 398 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { 399 const Function &F = *MF.getFunction(); 400 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 401 402 // Compute maximum number of VGPRs function can use using default/requested 403 // minimum number of waves per execution unit. 404 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 405 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); 406 407 // Check if maximum number of VGPRs was explicitly requested using 408 // "amdgpu-num-vgpr" attribute. 409 if (F.hasFnAttribute("amdgpu-num-vgpr")) { 410 unsigned Requested = AMDGPU::getIntegerAttribute( 411 F, "amdgpu-num-vgpr", MaxNumVGPRs); 412 413 // Make sure requested value does not violate subtarget's specifications. 414 if (Requested && Requested <= getReservedNumVGPRs(MF)) 415 Requested = 0; 416 417 // Make sure requested value is compatible with values implied by 418 // default/requested minimum/maximum number of waves per execution unit. 419 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) 420 Requested = 0; 421 if (WavesPerEU.second && 422 Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) 423 Requested = 0; 424 425 if (Requested) 426 MaxNumVGPRs = Requested; 427 } 428 429 return MaxNumVGPRs - getReservedNumVGPRs(MF); 430 } 431