1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUSubtarget.h" 16 #include "SIMachineFunctionInfo.h" 17 #include "llvm/ADT/SmallString.h" 18 #include "llvm/CodeGen/MachineScheduler.h" 19 #include "llvm/Target/TargetFrameLowering.h" 20 #include <algorithm> 21 22 using namespace llvm; 23 24 #define DEBUG_TYPE "amdgpu-subtarget" 25 26 #define GET_SUBTARGETINFO_ENUM 27 #define GET_SUBTARGETINFO_TARGET_DESC 28 #define GET_SUBTARGETINFO_CTOR 29 #include "AMDGPUGenSubtargetInfo.inc" 30 31 AMDGPUSubtarget::~AMDGPUSubtarget() = default; 32 33 AMDGPUSubtarget & 34 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, 35 StringRef GPU, StringRef FS) { 36 // Determine default and user-specified characteristics 37 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be 38 // enabled, but some instructions do not respect them and they run at the 39 // double precision rate, so don't enable by default. 40 // 41 // We want to be able to turn these off, but making this a subtarget feature 42 // for SI has the unhelpful behavior that it unsets everything else if you 43 // disable it. 44 45 SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"); 46 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 47 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; 48 49 FullFS += FS; 50 51 ParseSubtargetFeatures(GPU, FullFS); 52 53 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es 54 // on VI and newer hardware to avoid assertion failures due to missing ADDR64 55 // variants of MUBUF instructions. 56 if (!hasAddr64() && !FS.contains("flat-for-global")) { 57 FlatForGlobal = true; 58 } 59 60 // FIXME: I don't think think Evergreen has any useful support for 61 // denormals, but should be checked. Should we issue a warning somewhere 62 // if someone tries to enable these? 63 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { 64 FP64FP16Denormals = false; 65 FP32Denormals = false; 66 } 67 68 // Set defaults if needed. 69 if (MaxPrivateElementSize == 0) 70 MaxPrivateElementSize = 4; 71 72 return *this; 73 } 74 75 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 76 const TargetMachine &TM) 77 : AMDGPUGenSubtargetInfo(TT, GPU, FS), 78 TargetTriple(TT), 79 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600), 80 IsaVersion(ISAVersion0_0_0), 81 WavefrontSize(64), 82 LocalMemorySize(0), 83 LDSBankCount(0), 84 MaxPrivateElementSize(0), 85 86 FastFMAF32(false), 87 HalfRate64Ops(false), 88 89 FP32Denormals(false), 90 FP64FP16Denormals(false), 91 FPExceptions(false), 92 DX10Clamp(false), 93 FlatForGlobal(false), 94 UnalignedScratchAccess(false), 95 UnalignedBufferAccess(false), 96 97 HasApertureRegs(false), 98 EnableXNACK(false), 99 TrapHandler(false), 100 DebuggerInsertNops(false), 101 DebuggerReserveRegs(false), 102 DebuggerEmitPrologue(false), 103 104 EnableVGPRSpilling(false), 105 EnablePromoteAlloca(false), 106 EnableLoadStoreOpt(false), 107 EnableUnsafeDSOffsetFolding(false), 108 EnableSIScheduler(false), 109 DumpCode(false), 110 111 FP64(false), 112 IsGCN(false), 113 GCN1Encoding(false), 114 GCN3Encoding(false), 115 CIInsts(false), 116 GFX9Insts(false), 117 SGPRInitBug(false), 118 HasSMemRealTime(false), 119 Has16BitInsts(false), 120 HasVOP3PInsts(false), 121 HasMovrel(false), 122 HasVGPRIndexMode(false), 123 HasScalarStores(false), 124 HasInv2PiInlineImm(false), 125 HasSDWA(false), 126 HasDPP(false), 127 FlatAddressSpace(false), 128 129 R600ALUInst(false), 130 CaymanISA(false), 131 CFALUBug(false), 132 HasVertexCache(false), 133 TexVTXClauseSize(0), 134 ScalarizeGlobal(false), 135 136 FeatureDisable(false), 137 InstrItins(getInstrItineraryForCPU(GPU)) { 138 initializeSubtargetDependencies(TT, GPU, FS); 139 } 140 141 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, 142 const Function &F) const { 143 if (NWaves == 1) 144 return getLocalMemorySize(); 145 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 146 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 147 unsigned MaxWaves = getMaxWavesPerEU(); 148 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; 149 } 150 151 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, 152 const Function &F) const { 153 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; 154 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); 155 unsigned MaxWaves = getMaxWavesPerEU(); 156 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; 157 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); 158 NumWaves = std::min(NumWaves, MaxWaves); 159 NumWaves = std::max(NumWaves, 1u); 160 return NumWaves; 161 } 162 163 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 164 const Function &F) const { 165 // Default minimum/maximum flat work group sizes. 166 std::pair<unsigned, unsigned> Default = 167 AMDGPU::isCompute(F.getCallingConv()) ? 168 std::pair<unsigned, unsigned>(getWavefrontSize() * 2, 169 getWavefrontSize() * 4) : 170 std::pair<unsigned, unsigned>(1, getWavefrontSize()); 171 172 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 173 // starts using "amdgpu-flat-work-group-size" attribute. 174 Default.second = AMDGPU::getIntegerAttribute( 175 F, "amdgpu-max-work-group-size", Default.second); 176 Default.first = std::min(Default.first, Default.second); 177 178 // Requested minimum/maximum flat work group sizes. 179 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 180 F, "amdgpu-flat-work-group-size", Default); 181 182 // Make sure requested minimum is less than requested maximum. 183 if (Requested.first > Requested.second) 184 return Default; 185 186 // Make sure requested values do not violate subtarget's specifications. 187 if (Requested.first < getMinFlatWorkGroupSize()) 188 return Default; 189 if (Requested.second > getMaxFlatWorkGroupSize()) 190 return Default; 191 192 return Requested; 193 } 194 195 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 196 const Function &F) const { 197 // Default minimum/maximum number of waves per execution unit. 198 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); 199 200 // Default/requested minimum/maximum flat work group sizes. 201 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 202 203 // If minimum/maximum flat work group sizes were explicitly requested using 204 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 205 // number of waves per execution unit to values implied by requested 206 // minimum/maximum flat work group sizes. 207 unsigned MinImpliedByFlatWorkGroupSize = 208 getMaxWavesPerEU(FlatWorkGroupSizes.second); 209 bool RequestedFlatWorkGroupSize = false; 210 211 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 212 // starts using "amdgpu-flat-work-group-size" attribute. 213 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 214 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 215 Default.first = MinImpliedByFlatWorkGroupSize; 216 RequestedFlatWorkGroupSize = true; 217 } 218 219 // Requested minimum/maximum number of waves per execution unit. 220 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 221 F, "amdgpu-waves-per-eu", Default, true); 222 223 // Make sure requested minimum is less than requested maximum. 224 if (Requested.second && Requested.first > Requested.second) 225 return Default; 226 227 // Make sure requested values do not violate subtarget's specifications. 228 if (Requested.first < getMinWavesPerEU() || 229 Requested.first > getMaxWavesPerEU()) 230 return Default; 231 if (Requested.second > getMaxWavesPerEU()) 232 return Default; 233 234 // Make sure requested values are compatible with values implied by requested 235 // minimum/maximum flat work group sizes. 236 if (RequestedFlatWorkGroupSize && 237 Requested.first > MinImpliedByFlatWorkGroupSize) 238 return Default; 239 240 return Requested; 241 } 242 243 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 244 const TargetMachine &TM) : 245 AMDGPUSubtarget(TT, GPU, FS, TM), 246 InstrInfo(*this), 247 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 248 TLInfo(TM, *this) {} 249 250 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, 251 const TargetMachine &TM) : 252 AMDGPUSubtarget(TT, GPU, FS, TM), 253 InstrInfo(*this), 254 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 255 TLInfo(TM, *this) {} 256 257 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 258 unsigned NumRegionInstrs) const { 259 // Track register pressure so the scheduler can try to decrease 260 // pressure once register usage is above the threshold defined by 261 // SIRegisterInfo::getRegPressureSetLimit() 262 Policy.ShouldTrackPressure = true; 263 264 // Enabling both top down and bottom up scheduling seems to give us less 265 // register spills than just using one of these approaches on its own. 266 Policy.OnlyTopDown = false; 267 Policy.OnlyBottomUp = false; 268 269 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 270 if (!enableSIScheduler()) 271 Policy.ShouldTrackLaneMasks = true; 272 } 273 274 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { 275 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 276 } 277 278 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, 279 unsigned ExplicitArgBytes) const { 280 unsigned ImplicitBytes = getImplicitArgNumBytes(MF); 281 if (ImplicitBytes == 0) 282 return ExplicitArgBytes; 283 284 unsigned Alignment = getAlignmentForImplicitArgPtr(); 285 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 286 } 287 288 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 289 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 290 if (SGPRs <= 80) 291 return 10; 292 if (SGPRs <= 88) 293 return 9; 294 if (SGPRs <= 100) 295 return 8; 296 return 7; 297 } 298 if (SGPRs <= 48) 299 return 10; 300 if (SGPRs <= 56) 301 return 9; 302 if (SGPRs <= 64) 303 return 8; 304 if (SGPRs <= 72) 305 return 7; 306 if (SGPRs <= 80) 307 return 6; 308 return 5; 309 } 310 311 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 312 if (VGPRs <= 24) 313 return 10; 314 if (VGPRs <= 28) 315 return 9; 316 if (VGPRs <= 32) 317 return 8; 318 if (VGPRs <= 36) 319 return 7; 320 if (VGPRs <= 40) 321 return 6; 322 if (VGPRs <= 48) 323 return 5; 324 if (VGPRs <= 64) 325 return 4; 326 if (VGPRs <= 84) 327 return 3; 328 if (VGPRs <= 128) 329 return 2; 330 return 1; 331 } 332 333 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { 334 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 335 if (MFI.hasFlatScratchInit()) { 336 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) 337 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 338 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) 339 return 4; // FLAT_SCRATCH, VCC (in that order). 340 } 341 342 if (isXNACKEnabled()) 343 return 4; // XNACK, VCC (in that order). 344 return 2; // VCC. 345 } 346 347 unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { 348 const Function &F = *MF.getFunction(); 349 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 350 351 // Compute maximum number of SGPRs function can use using default/requested 352 // minimum number of waves per execution unit. 353 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 354 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); 355 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); 356 357 // Check if maximum number of SGPRs was explicitly requested using 358 // "amdgpu-num-sgpr" attribute. 359 if (F.hasFnAttribute("amdgpu-num-sgpr")) { 360 unsigned Requested = AMDGPU::getIntegerAttribute( 361 F, "amdgpu-num-sgpr", MaxNumSGPRs); 362 363 // Make sure requested value does not violate subtarget's specifications. 364 if (Requested && (Requested <= getReservedNumSGPRs(MF))) 365 Requested = 0; 366 367 // If more SGPRs are required to support the input user/system SGPRs, 368 // increase to accommodate them. 369 // 370 // FIXME: This really ends up using the requested number of SGPRs + number 371 // of reserved special registers in total. Theoretically you could re-use 372 // the last input registers for these special registers, but this would 373 // require a lot of complexity to deal with the weird aliasing. 374 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs(); 375 if (Requested && Requested < InputNumSGPRs) 376 Requested = InputNumSGPRs; 377 378 // Make sure requested value is compatible with values implied by 379 // default/requested minimum/maximum number of waves per execution unit. 380 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) 381 Requested = 0; 382 if (WavesPerEU.second && 383 Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) 384 Requested = 0; 385 386 if (Requested) 387 MaxNumSGPRs = Requested; 388 } 389 390 if (hasSGPRInitBug()) 391 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; 392 393 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF), 394 MaxAddressableNumSGPRs); 395 } 396 397 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { 398 const Function &F = *MF.getFunction(); 399 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 400 401 // Compute maximum number of VGPRs function can use using default/requested 402 // minimum number of waves per execution unit. 403 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU(); 404 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); 405 406 // Check if maximum number of VGPRs was explicitly requested using 407 // "amdgpu-num-vgpr" attribute. 408 if (F.hasFnAttribute("amdgpu-num-vgpr")) { 409 unsigned Requested = AMDGPU::getIntegerAttribute( 410 F, "amdgpu-num-vgpr", MaxNumVGPRs); 411 412 // Make sure requested value does not violate subtarget's specifications. 413 if (Requested && Requested <= getReservedNumVGPRs(MF)) 414 Requested = 0; 415 416 // Make sure requested value is compatible with values implied by 417 // default/requested minimum/maximum number of waves per execution unit. 418 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) 419 Requested = 0; 420 if (WavesPerEU.second && 421 Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) 422 Requested = 0; 423 424 if (Requested) 425 MaxNumVGPRs = Requested; 426 } 427 428 return MaxNumVGPRs - getReservedNumVGPRs(MF); 429 } 430