//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing
  // ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
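// Illustrative example (the feature string "+xnack" is an assumption for
// exposition, not taken from this file): for an amdhsa target constructed
// with FS = "+xnack", the string handed to ParseSubtargetFeatures above
// would be
//   "+promote-alloca,+fp64-fp16-denormals,+load-store-opt,"
//   "+flat-for-global,+unaligned-buffer-access,+xnack"
// User-specified features are appended last, so they override the defaults
// prepended by initializeSubtargetDependencies.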
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
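// Illustrative example (hypothetical attribute values, not from this file):
// a kernel carrying the IR attribute
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// requests between 128 and 256 work items per work group. If the minimum
// exceeds the maximum, or either bound falls outside
// [getMinFlatWorkGroupSize(), getMaxFlatWorkGroupSize()], the request is
// dropped and the defaults computed above are returned instead.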
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
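// Illustrative example (hypothetical attribute values): with
//   "amdgpu-waves-per-eu"="2,4"
// the function above returns (2, 4), provided that 2 >= getMinWavesPerEU(),
// that 4 <= getMaxWavesPerEU(), and that the requested minimum does not
// exceed the minimum implied by an explicitly requested flat work group
// size; otherwise the defaults are returned. A maximum of 0 is treated as
// unspecified, so a bare "amdgpu-waves-per-eu"="2" is also accepted.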
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getMaxNumSGPRs() const {
  if (hasSGPRInitBug())
    return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

  if (getGeneration() >= VOLCANIC_ISLANDS)
    return 102;

  return 104;
}
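// Illustrative worked example (numbers read off the tables above): on a
// VOLCANIC_ISLANDS subtarget, getOccupancyWithNumSGPRs(96) returns 8
// (96 <= 100) and getOccupancyWithNumVGPRs(40) returns 6 (40 <= 40), so a
// kernel using 96 SGPRs and 40 VGPRs is bounded by the tighter of the two
// limits: 6 waves per execution unit.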