//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  // Append the user features last so that they take precedence over the
  // defaults set above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP16Denormals = false;
    FP32Denormals = false;
    FP64Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ?
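        // amdgcn triples select the GCN (SOUTHERN_ISLANDS+) line; anything
        // else defaults to the R600 family.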
        SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP16Denormals(false),
    FP32Denormals(false),
    FP64Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}

// FIXME: These limits are for SI. Did they change with the larger maximum LDS
// size?
// Note: the returned values are floor(16384 / NWaves), i.e. an even split of a
// 16 KB LDS budget among the waves.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
  switch (NWaves) {
  case 10:
    return 1638;
  case 9:
    return 1820;
  case 8:
    return 2048;
  case 7:
    return 2340;
  case 6:
    return 2730;
  case 5:
    return 3276;
  case 4:
    return 4096;
  case 3:
    return 5461;
  case 2:
    return 8192;
  default:
    return getLocalMemorySize();
  }
}

// The inverse of getMaxLocalMemSizeWithWaveCount: the thresholds below are the
// same floor(16384 / NWaves) table.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
  if (Bytes <= 1638)
    return 10;

  if (Bytes <= 1820)
    return 9;

  if (Bytes <= 2048)
    return 8;

  if (Bytes <= 2340)
    return 7;

  if (Bytes <= 2730)
    return 6;

  if (Bytes <= 3276)
    return 5;

  if (Bytes <= 4096)
    return 4;

  if (Bytes <= 5461)
    return 3;

  if (Bytes <= 8192)
    return 2;

  return 1;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
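  // For example, a request of "amdgpu-flat-work-group-size"="128,2048" falls
  // back to Default if 2048 exceeds getMaxFlatWorkGroupSize() for this
  // subtarget.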
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit (a maximum of
  // 0 means no explicit upper bound is requested).
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum, if
  // a maximum was given.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
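  // Only enable it when the default machine scheduler is in use.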
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes();
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  // Pad the explicit arguments out to the alignment of the implicit argument
  // pointer, then append the implicit arguments.
  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

// These thresholds split the 256 VGPRs of a SIMD evenly among the waves,
// rounded down to the VGPR allocation granularity of four.
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getMaxNumSGPRs() const {
  if (hasSGPRInitBug())
    return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

  if (getGeneration() >= VOLCANIC_ISLANDS)
    return 102;

  return 104;
}