//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

/// Build the effective feature string (built-in defaults first, then the
/// user-provided \p FS so user settings win), parse it, and finally correct
/// feature bits that do not apply to the selected generation.
/// Returns *this so it can be chained from a constructor initializer.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  // Append the caller's features last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    // Pre-SI generations: force denormal features back off regardless of
    // what the parsed feature string requested.
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

/// Base subtarget constructor. Every feature member is initialized to a
/// conservative default here; ParseSubtargetFeatures (invoked through
/// initializeSubtargetDependencies in the body) then enables features based
/// on the GPU name and feature string. Note Gen defaults to SOUTHERN_ISLANDS
/// for the amdgcn triple and R600 otherwise.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}

// FIXME: These limits are for SI. Did they change with the larger maximum LDS
// size?
/// Return the largest local-memory (LDS) allocation, in bytes, that still
/// permits \p NWaves waves per execution unit. Each table entry is exactly
/// floor(16384 / NWaves) — presumably 16384 is the per-wave-granule LDS
/// budget on SI; see the FIXME above about larger LDS sizes (TODO confirm).
/// For out-of-range wave counts the full local memory size is returned.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
  switch (NWaves) {
  case 10:
    return 1638;
  case 9:
    return 1820;
  case 8:
    return 2048;
  case 7:
    return 2340;
  case 6:
    return 2730;
  case 5:
    return 3276;
  case 4:
    return 4096;
  case 3:
    return 5461;
  case 2:
    return 8192;
  default:
    return getLocalMemorySize();
  }
}

/// Inverse of getMaxLocalMemSizeWithWaveCount: given an LDS usage of
/// \p Bytes, return the achievable occupancy (waves per execution unit).
/// The thresholds mirror the table above; anything above 8192 bytes is
/// limited to a single wave.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
  if (Bytes <= 1638)
    return 10;

  if (Bytes <= 1820)
    return 9;

  if (Bytes <= 2048)
    return 8;

  if (Bytes <= 2340)
    return 7;

  if (Bytes <= 2730)
    return 6;

  if (Bytes <= 3276)
    return 5;

  if (Bytes <= 4096)
    return 4;

  if (Bytes <= 5461)
    return 3;

  if (Bytes <= 8192)
    return 2;

  return 1;
}

/// Compute the {minimum, maximum} flat work group size for \p F.
/// Starts from calling-convention-based defaults (compute kernels default to
/// 2x-4x the wavefront size, shaders to 1..wavefront size), then applies the
/// "amdgpu-max-work-group-size" and "amdgpu-flat-work-group-size" function
/// attributes. Falls back to the defaults whenever the requested pair is
/// inconsistent or outside the subtarget's supported range.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the default pair self-consistent after the max may have shrunk.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// Compute the {minimum, maximum} number of waves per execution unit for
/// \p F from the "amdgpu-waves-per-eu" attribute, cross-checked against the
/// occupancy implied by the function's flat work group sizes. A maximum of 0
/// means "no explicit upper bound". Invalid or conflicting requests fall
/// back to the default pair.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A requested maximum of 0 means unspecified and is skipped here.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

/// R600-generation subtarget. Note the frame grows upward (StackGrowsUp),
/// matching the AMDGPU private-memory model.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

/// GCN (SI+) subtarget; mirrors the R600 constructor with GCN-specific
/// instruction, frame-lowering, and target-lowering info.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

/// Tune the generic machine scheduler for GCN: track register pressure and
/// schedule in both directions to reduce spilling.
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

/// VGPR spilling is always allowed for non-shader (compute) functions;
/// shaders require the subtarget feature flag.
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

/// Total kernel argument segment size in bytes: the explicit arguments,
/// padded up to the implicit-argument-pointer alignment, plus the implicit
/// argument bytes (if the ABI defines any).
unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes();
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

/// Occupancy (waves/EU) achievable with \p SGPRs scalar registers per wave.
/// VI+ has different SGPR banking than SI/CI, hence the two tables.
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// Occupancy (waves/EU) achievable with \p VGPRs vector registers per wave;
/// a single table applies to all GCN generations handled here.
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

/// Maximum number of addressable SGPRs: capped by a fixed count on parts
/// with the SGPR-init hardware bug, otherwise 102 on VI+ and 104 earlier.
unsigned SISubtarget::getMaxNumSGPRs() const {
  if (hasSGPRInitBug())
    return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

  if (getGeneration() >= VOLCANIC_ISLANDS)
    return 102;

  return 104;
}