//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

/// Build the full feature string (defaults plus user-provided \p FS), run the
/// TableGen'd feature parser, and then apply fixups that cannot be expressed
/// as plain subtarget features. Returns *this so the constructor can chain it
/// while initializing dependent members.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults are prepended so that anything in the user FS string overrides
  // them (last occurrence of a feature wins in the parser).
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

/// All feature flags start false here; their real values are filled in by
/// initializeSubtargetDependencies() at the end of the init list, which runs
/// the generated feature parser over GPU/FS.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SI at minimum); everything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    NoAddr64(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}

// FIXME: These limits are for SI. Did they change with the larger maximum LDS
// size?
131 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { 132 switch (NWaves) { 133 case 10: 134 return 1638; 135 case 9: 136 return 1820; 137 case 8: 138 return 2048; 139 case 7: 140 return 2340; 141 case 6: 142 return 2730; 143 case 5: 144 return 3276; 145 case 4: 146 return 4096; 147 case 3: 148 return 5461; 149 case 2: 150 return 8192; 151 default: 152 return getLocalMemorySize(); 153 } 154 } 155 156 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { 157 if (Bytes <= 1638) 158 return 10; 159 160 if (Bytes <= 1820) 161 return 9; 162 163 if (Bytes <= 2048) 164 return 8; 165 166 if (Bytes <= 2340) 167 return 7; 168 169 if (Bytes <= 2730) 170 return 6; 171 172 if (Bytes <= 3276) 173 return 5; 174 175 if (Bytes <= 4096) 176 return 4; 177 178 if (Bytes <= 5461) 179 return 3; 180 181 if (Bytes <= 8192) 182 return 2; 183 184 return 1; 185 } 186 187 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 188 const Function &F) const { 189 // Default minimum/maximum flat work group sizes. 190 std::pair<unsigned, unsigned> Default = 191 AMDGPU::isCompute(F.getCallingConv()) ? 192 std::pair<unsigned, unsigned>(getWavefrontSize() * 2, 193 getWavefrontSize() * 4) : 194 std::pair<unsigned, unsigned>(1, getWavefrontSize()); 195 196 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 197 // starts using "amdgpu-flat-work-group-size" attribute. 198 Default.second = AMDGPU::getIntegerAttribute( 199 F, "amdgpu-max-work-group-size", Default.second); 200 Default.first = std::min(Default.first, Default.second); 201 202 // Requested minimum/maximum flat work group sizes. 203 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 204 F, "amdgpu-flat-work-group-size", Default); 205 206 // Make sure requested minimum is less than requested maximum. 
207 if (Requested.first > Requested.second) 208 return Default; 209 210 // Make sure requested values do not violate subtarget's specifications. 211 if (Requested.first < getMinFlatWorkGroupSize()) 212 return Default; 213 if (Requested.second > getMaxFlatWorkGroupSize()) 214 return Default; 215 216 return Requested; 217 } 218 219 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 220 const Function &F) const { 221 // Default minimum/maximum number of waves per execution unit. 222 std::pair<unsigned, unsigned> Default(1, 0); 223 224 // Default/requested minimum/maximum flat work group sizes. 225 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 226 227 // If minimum/maximum flat work group sizes were explicitly requested using 228 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 229 // number of waves per execution unit to values implied by requested 230 // minimum/maximum flat work group sizes. 231 unsigned MinImpliedByFlatWorkGroupSize = 232 getMaxWavesPerEU(FlatWorkGroupSizes.second); 233 bool RequestedFlatWorkGroupSize = false; 234 235 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 236 // starts using "amdgpu-flat-work-group-size" attribute. 237 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 238 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 239 Default.first = MinImpliedByFlatWorkGroupSize; 240 RequestedFlatWorkGroupSize = true; 241 } 242 243 // Requested minimum/maximum number of waves per execution unit. 244 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 245 F, "amdgpu-waves-per-eu", Default, true); 246 247 // Make sure requested minimum is less than requested maximum. 248 if (Requested.second && Requested.first > Requested.second) 249 return Default; 250 251 // Make sure requested values do not violate subtarget's specifications. 
252 if (Requested.first < getMinWavesPerEU() || 253 Requested.first > getMaxWavesPerEU()) 254 return Default; 255 if (Requested.second > getMaxWavesPerEU()) 256 return Default; 257 258 // Make sure requested values are compatible with values implied by requested 259 // minimum/maximum flat work group sizes. 260 if (RequestedFlatWorkGroupSize && 261 Requested.first > MinImpliedByFlatWorkGroupSize) 262 return Default; 263 264 return Requested; 265 } 266 267 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 268 const TargetMachine &TM) : 269 AMDGPUSubtarget(TT, GPU, FS, TM), 270 InstrInfo(*this), 271 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 272 TLInfo(TM, *this) {} 273 274 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, 275 const TargetMachine &TM) : 276 AMDGPUSubtarget(TT, GPU, FS, TM), 277 InstrInfo(*this), 278 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 279 TLInfo(TM, *this) {} 280 281 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 282 unsigned NumRegionInstrs) const { 283 // Track register pressure so the scheduler can try to decrease 284 // pressure once register usage is above the threshold defined by 285 // SIRegisterInfo::getRegPressureSetLimit() 286 Policy.ShouldTrackPressure = true; 287 288 // Enabling both top down and bottom up scheduling seems to give us less 289 // register spills than just using one of these approaches on its own. 290 Policy.OnlyTopDown = false; 291 Policy.OnlyBottomUp = false; 292 293 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 
294 if (!enableSIScheduler()) 295 Policy.ShouldTrackLaneMasks = true; 296 } 297 298 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { 299 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 300 } 301 302 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const { 303 unsigned ImplicitBytes = getImplicitArgNumBytes(); 304 if (ImplicitBytes == 0) 305 return ExplicitArgBytes; 306 307 unsigned Alignment = getAlignmentForImplicitArgPtr(); 308 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 309 } 310 311 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 312 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 313 if (SGPRs <= 80) 314 return 10; 315 if (SGPRs <= 88) 316 return 9; 317 if (SGPRs <= 100) 318 return 8; 319 return 7; 320 } 321 if (SGPRs <= 48) 322 return 10; 323 if (SGPRs <= 56) 324 return 9; 325 if (SGPRs <= 64) 326 return 8; 327 if (SGPRs <= 72) 328 return 7; 329 if (SGPRs <= 80) 330 return 6; 331 return 5; 332 } 333 334 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 335 if (VGPRs <= 24) 336 return 10; 337 if (VGPRs <= 28) 338 return 9; 339 if (VGPRs <= 32) 340 return 8; 341 if (VGPRs <= 36) 342 return 7; 343 if (VGPRs <= 40) 344 return 6; 345 if (VGPRs <= 48) 346 return 5; 347 if (VGPRs <= 64) 348 return 4; 349 if (VGPRs <= 84) 350 return 3; 351 if (VGPRs <= 128) 352 return 2; 353 return 1; 354 } 355 356 unsigned SISubtarget::getMaxNumSGPRs() const { 357 if (hasSGPRInitBug()) 358 return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; 359 360 if (getGeneration() >= VOLCANIC_ISLANDS) 361 return 102; 362 363 return 104; 364 } 365