//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

/// Build the final feature string (defaults + user features), parse it, and
/// then apply fixups that cannot be expressed as plain subtarget features.
/// Returns *this so it can be used in the constructor's initializer chain.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults come first so that the user-supplied FS (appended below) can
  // override any of them.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

// Constructor: every feature flag starts false/zero and is then filled in by
// initializeSubtargetDependencies() from the parsed feature string, so the
// init list below is purely a safe-default baseline.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SI+); everything else is the older R600 family.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}

/// Return the amount of LDS (local memory) a workgroup of function \p F may
/// use while still allowing \p NWaves waves to be resident, derived from the
/// function's maximum flat work group size.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  // A single wave can use all of LDS by itself.
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

/// Inverse of getMaxLocalMemSizeWithWaveCount: given an LDS usage of
/// \p Bytes, return the achievable wave occupancy, clamped to [1, MaxWaves].
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // Guard against division by zero when no LDS is used.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

/// Compute the {min, max} flat work group sizes for \p F from its attributes,
/// falling back to subtarget defaults when the request is absent or invalid.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  // Compute shaders default to 2-4 waves; other calling conventions to a
  // single wavefront at most.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the default range well-formed if the attribute lowered the maximum.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  // Any out-of-range request is discarded entirely rather than clamped.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// Compute the {min, max} waves-per-EU for \p F, honoring the
/// "amdgpu-waves-per-eu" attribute and keeping the result consistent with the
/// function's flat work group sizes. Invalid requests fall back to Default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // A second value of 0 means "no explicit maximum" and skips this check.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

/// Tune the machine scheduler for SI: track register pressure and allow
/// bidirectional scheduling to reduce spilling.
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  Policy.ShouldTrackLaneMasks = enableSubRegLiveness();
}

// VGPR spilling is always available to non-shader (compute) functions;
// shaders additionally require the EnableVGPRSpilling feature.
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

/// Total kernarg segment size: explicit arguments padded up to the implicit
/// argument pointer alignment, plus any implicit argument bytes.
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

/// Map an SGPR count to achievable wave occupancy. The thresholds differ
/// between VI+ and earlier generations (hardware SGPR-bank limits;
/// values are per the generation-specific register file sizes).
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// Map a VGPR count to achievable wave occupancy (same table for all GCN
/// generations handled here).
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

/// Number of SGPRs set aside for special registers (VCC, FLAT_SCRATCH,
/// XNACK) that user code may not allocate.
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

/// Maximum number of SGPRs the function may use, after honoring the
/// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, preloaded
/// input SGPRs, and the SGPR-init hardware bug.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 throughout means "request rejected, use the default".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware workaround: affected chips must allocate a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

/// Maximum number of VGPRs the function may use, after honoring the
/// "amdgpu-num-vgpr" attribute and the waves-per-EU constraints.
/// Mirrors getMaxNumSGPRs() above, minus the input-SGPR and init-bug cases.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}