//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

/// Build the full feature string (built-in defaults plus the user-supplied
/// \p FS), parse it into the subtarget feature bits, and then apply fixups
/// that depend on the OS and hardware generation. Returns *this so the
/// constructor can use it while initializing dependent members.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";

  // Append the user-specified features last so they override the defaults
  // composed above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

/// All feature members are initialized to conservative defaults (false/0)
/// here; the real values are filled in by ParseSubtargetFeatures, invoked
/// from initializeSubtargetDependencies() in the constructor body.
/// NOTE: the initializer order must match the member declaration order in
/// AMDGPUSubtarget.h.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SI+); everything else is the R600 family.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  initializeSubtargetDependencies(TT, GPU, FS);
}

// FIXME: These limits are for SI. Did they change with the larger maximum LDS
// size?
137 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { 138 switch (NWaves) { 139 case 10: 140 return 1638; 141 case 9: 142 return 1820; 143 case 8: 144 return 2048; 145 case 7: 146 return 2340; 147 case 6: 148 return 2730; 149 case 5: 150 return 3276; 151 case 4: 152 return 4096; 153 case 3: 154 return 5461; 155 case 2: 156 return 8192; 157 default: 158 return getLocalMemorySize(); 159 } 160 } 161 162 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { 163 if (Bytes <= 1638) 164 return 10; 165 166 if (Bytes <= 1820) 167 return 9; 168 169 if (Bytes <= 2048) 170 return 8; 171 172 if (Bytes <= 2340) 173 return 7; 174 175 if (Bytes <= 2730) 176 return 6; 177 178 if (Bytes <= 3276) 179 return 5; 180 181 if (Bytes <= 4096) 182 return 4; 183 184 if (Bytes <= 5461) 185 return 3; 186 187 if (Bytes <= 8192) 188 return 2; 189 190 return 1; 191 } 192 193 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 194 const Function &F) const { 195 // Default minimum/maximum flat work group sizes. 196 std::pair<unsigned, unsigned> Default = 197 AMDGPU::isCompute(F.getCallingConv()) ? 198 std::pair<unsigned, unsigned>(getWavefrontSize() * 2, 199 getWavefrontSize() * 4) : 200 std::pair<unsigned, unsigned>(1, getWavefrontSize()); 201 202 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 203 // starts using "amdgpu-flat-work-group-size" attribute. 204 Default.second = AMDGPU::getIntegerAttribute( 205 F, "amdgpu-max-work-group-size", Default.second); 206 Default.first = std::min(Default.first, Default.second); 207 208 // Requested minimum/maximum flat work group sizes. 209 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 210 F, "amdgpu-flat-work-group-size", Default); 211 212 // Make sure requested minimum is less than requested maximum. 
213 if (Requested.first > Requested.second) 214 return Default; 215 216 // Make sure requested values do not violate subtarget's specifications. 217 if (Requested.first < getMinFlatWorkGroupSize()) 218 return Default; 219 if (Requested.second > getMaxFlatWorkGroupSize()) 220 return Default; 221 222 return Requested; 223 } 224 225 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 226 const Function &F) const { 227 // Default minimum/maximum number of waves per execution unit. 228 std::pair<unsigned, unsigned> Default(1, 0); 229 230 // Default/requested minimum/maximum flat work group sizes. 231 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 232 233 // If minimum/maximum flat work group sizes were explicitly requested using 234 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 235 // number of waves per execution unit to values implied by requested 236 // minimum/maximum flat work group sizes. 237 unsigned MinImpliedByFlatWorkGroupSize = 238 getMaxWavesPerEU(FlatWorkGroupSizes.second); 239 bool RequestedFlatWorkGroupSize = false; 240 241 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 242 // starts using "amdgpu-flat-work-group-size" attribute. 243 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 244 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 245 Default.first = MinImpliedByFlatWorkGroupSize; 246 RequestedFlatWorkGroupSize = true; 247 } 248 249 // Requested minimum/maximum number of waves per execution unit. 250 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 251 F, "amdgpu-waves-per-eu", Default, true); 252 253 // Make sure requested minimum is less than requested maximum. 254 if (Requested.second && Requested.first > Requested.second) 255 return Default; 256 257 // Make sure requested values do not violate subtarget's specifications. 
258 if (Requested.first < getMinWavesPerEU() || 259 Requested.first > getMaxWavesPerEU()) 260 return Default; 261 if (Requested.second > getMaxWavesPerEU()) 262 return Default; 263 264 // Make sure requested values are compatible with values implied by requested 265 // minimum/maximum flat work group sizes. 266 if (RequestedFlatWorkGroupSize && 267 Requested.first > MinImpliedByFlatWorkGroupSize) 268 return Default; 269 270 return Requested; 271 } 272 273 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 274 const TargetMachine &TM) : 275 AMDGPUSubtarget(TT, GPU, FS, TM), 276 InstrInfo(*this), 277 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 278 TLInfo(TM, *this) {} 279 280 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, 281 const TargetMachine &TM) : 282 AMDGPUSubtarget(TT, GPU, FS, TM), 283 InstrInfo(*this), 284 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 285 TLInfo(TM, *this) {} 286 287 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 288 unsigned NumRegionInstrs) const { 289 // Track register pressure so the scheduler can try to decrease 290 // pressure once register usage is above the threshold defined by 291 // SIRegisterInfo::getRegPressureSetLimit() 292 Policy.ShouldTrackPressure = true; 293 294 // Enabling both top down and bottom up scheduling seems to give us less 295 // register spills than just using one of these approaches on its own. 296 Policy.OnlyTopDown = false; 297 Policy.OnlyBottomUp = false; 298 299 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 
300 if (!enableSIScheduler()) 301 Policy.ShouldTrackLaneMasks = true; 302 } 303 304 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { 305 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 306 } 307 308 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, 309 unsigned ExplicitArgBytes) const { 310 unsigned ImplicitBytes = getImplicitArgNumBytes(MF); 311 if (ImplicitBytes == 0) 312 return ExplicitArgBytes; 313 314 unsigned Alignment = getAlignmentForImplicitArgPtr(); 315 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 316 } 317 318 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 319 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 320 if (SGPRs <= 80) 321 return 10; 322 if (SGPRs <= 88) 323 return 9; 324 if (SGPRs <= 100) 325 return 8; 326 return 7; 327 } 328 if (SGPRs <= 48) 329 return 10; 330 if (SGPRs <= 56) 331 return 9; 332 if (SGPRs <= 64) 333 return 8; 334 if (SGPRs <= 72) 335 return 7; 336 if (SGPRs <= 80) 337 return 6; 338 return 5; 339 } 340 341 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 342 if (VGPRs <= 24) 343 return 10; 344 if (VGPRs <= 28) 345 return 9; 346 if (VGPRs <= 32) 347 return 8; 348 if (VGPRs <= 36) 349 return 7; 350 if (VGPRs <= 40) 351 return 6; 352 if (VGPRs <= 48) 353 return 5; 354 if (VGPRs <= 64) 355 return 4; 356 if (VGPRs <= 84) 357 return 3; 358 if (VGPRs <= 128) 359 return 2; 360 return 1; 361 } 362 363 unsigned SISubtarget::getMaxNumSGPRs() const { 364 if (hasSGPRInitBug()) 365 return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; 366 367 if (getGeneration() >= VOLCANIC_ISLANDS) 368 return 102; 369 370 return 104; 371 } 372