//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

// Build the effective feature string (defaults first, user features last so
// they win), parse it, and apply fixups that cannot be expressed as plain
// subtarget features. Returns *this so the ctor can use it in an initializer.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  // User-specified features (FS) are appended after the defaults above so
  // that explicit +/- choices override them during parsing.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are the GCN family (SI and newer); anything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  // Feature parsing overwrites the member defaults initialized above.
  initializeSubtargetDependencies(TT, GPU, FS);
}

// Local memory (LDS) budget, in bytes, for a single workgroup of F when
// NWaves waves must stay resident: scales getLocalMemorySize() by the ratio
// of the hardware-maximum waves per EU to NWaves, split across the workgroups
// that share a CU at F's maximum flat workgroup size.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// Inverse of the above: the occupancy (waves, clamped to [1, MaxWaves]) still
// achievable when each workgroup of F uses Bytes bytes of local memory.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // Guard against Bytes == 0 to avoid a division by zero.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

// Returns the {minimum, maximum} flat work group size for F, honoring the
// "amdgpu-flat-work-group-size" (and legacy "amdgpu-max-work-group-size")
// attributes. Falls back to the calling-convention default whenever the
// requested range is inconsistent or violates subtarget limits.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

// Returns the {minimum, maximum} number of waves per execution unit for F,
// honoring the "amdgpu-waves-per-eu" attribute and cross-checking it against
// the range implied by F's flat work group sizes. A maximum of 0 in the
// result means "unspecified". Falls back to Default on any inconsistency.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  Policy.ShouldTrackLaneMasks = enableSubRegLiveness();
}

// VGPR spilling is always permitted for non-shader (compute) calling
// conventions; shaders additionally require the EnableVGPRSpilling flag.
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

// Total kernarg segment size: the explicit argument bytes, padded up to the
// implicit-argument-pointer alignment, plus the implicit (hidden) argument
// bytes — unless there are no implicit arguments at all.
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

// Occupancy (waves per EU) achievable at a given SGPR usage. The thresholds
// are per-generation register-budget steps; VI+ has a different table from
// SI/CI.
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

// Occupancy (waves per EU) achievable at a given VGPR usage; the thresholds
// step down as more of the VGPR file is consumed per wave.
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

// Number of SGPRs reserved for special registers (VCC, FLAT_SCRATCH, XNACK)
// that user code in MF can never allocate.
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

// Maximum number of SGPRs available to MF after honoring the
// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, the preloaded
// input SGPRs, the SGPR-init hardware bug, and the reserved special SGPRs.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    // Requested == 0 here means the attribute value was rejected above and
    // the default MaxNumSGPRs stands.
    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

// Maximum number of VGPRs available to MF after honoring the
// "amdgpu-num-vgpr" attribute, the waves-per-EU constraints, and the
// reserved VGPRs. Mirrors getMaxNumSGPRs above.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    // Requested == 0 here means the attribute value was rejected above and
    // the default MaxNumVGPRs stands.
    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}