1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUSubtarget.h" 16 #include "R600ISelLowering.h" 17 #include "R600InstrInfo.h" 18 #include "SIFrameLowering.h" 19 #include "SIISelLowering.h" 20 #include "SIInstrInfo.h" 21 #include "SIMachineFunctionInfo.h" 22 #include "llvm/ADT/SmallString.h" 23 #include "llvm/CodeGen/MachineScheduler.h" 24 25 using namespace llvm; 26 27 #define DEBUG_TYPE "amdgpu-subtarget" 28 29 #define GET_SUBTARGETINFO_ENUM 30 #define GET_SUBTARGETINFO_TARGET_DESC 31 #define GET_SUBTARGETINFO_CTOR 32 #include "AMDGPUGenSubtargetInfo.inc" 33 34 AMDGPUSubtarget::~AMDGPUSubtarget() {} 35 36 AMDGPUSubtarget & 37 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, 38 StringRef GPU, StringRef FS) { 39 // Determine default and user-specified characteristics 40 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be 41 // enabled, but some instructions do not respect them and they run at the 42 // double precision rate, so don't enable by default. 43 // 44 // We want to be able to turn these off, but making this a subtarget feature 45 // for SI has the unhelpful behavior that it unsets everything else if you 46 // disable it. 47 48 SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,"); 49 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 50 FullFS += "+flat-for-global,+unaligned-buffer-access,"; 51 FullFS += FS; 52 53 ParseSubtargetFeatures(GPU, FullFS); 54 55 // FIXME: I don't think think Evergreen has any useful support for 56 // denormals, but should be checked. Should we issue a warning somewhere 57 // if someone tries to enable these? 58 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { 59 FP32Denormals = false; 60 FP64Denormals = false; 61 } 62 63 // Set defaults if needed. 64 if (MaxPrivateElementSize == 0) 65 MaxPrivateElementSize = 4; 66 67 return *this; 68 } 69 70 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 71 const TargetMachine &TM) 72 : AMDGPUGenSubtargetInfo(TT, GPU, FS), 73 TargetTriple(TT), 74 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600), 75 IsaVersion(ISAVersion0_0_0), 76 WavefrontSize(64), 77 LocalMemorySize(0), 78 LDSBankCount(0), 79 MaxPrivateElementSize(0), 80 81 FastFMAF32(false), 82 HalfRate64Ops(false), 83 84 FP32Denormals(false), 85 FP64Denormals(false), 86 FPExceptions(false), 87 FlatForGlobal(false), 88 UnalignedBufferAccess(false), 89 90 EnableXNACK(false), 91 DebuggerInsertNops(false), 92 DebuggerReserveRegs(false), 93 DebuggerEmitPrologue(false), 94 95 EnableVGPRSpilling(false), 96 EnablePromoteAlloca(false), 97 EnableLoadStoreOpt(false), 98 EnableUnsafeDSOffsetFolding(false), 99 EnableSIScheduler(false), 100 DumpCode(false), 101 102 FP64(false), 103 IsGCN(false), 104 GCN1Encoding(false), 105 GCN3Encoding(false), 106 CIInsts(false), 107 SGPRInitBug(false), 108 HasSMemRealTime(false), 109 Has16BitInsts(false), 110 FlatAddressSpace(false), 111 112 R600ALUInst(false), 113 CaymanISA(false), 114 CFALUBug(false), 115 HasVertexCache(false), 116 TexVTXClauseSize(0), 117 118 FeatureDisable(false), 119 InstrItins(getInstrItineraryForCPU(GPU)), 120 TSInfo() { 121 initializeSubtargetDependencies(TT, GPU, FS); 122 } 123 124 // FIXME: These limits are for SI. Did they change with the larger maximum LDS 125 // size? 126 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { 127 switch (NWaves) { 128 case 10: 129 return 1638; 130 case 9: 131 return 1820; 132 case 8: 133 return 2048; 134 case 7: 135 return 2340; 136 case 6: 137 return 2730; 138 case 5: 139 return 3276; 140 case 4: 141 return 4096; 142 case 3: 143 return 5461; 144 case 2: 145 return 8192; 146 default: 147 return getLocalMemorySize(); 148 } 149 } 150 151 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { 152 if (Bytes <= 1638) 153 return 10; 154 155 if (Bytes <= 1820) 156 return 9; 157 158 if (Bytes <= 2048) 159 return 8; 160 161 if (Bytes <= 2340) 162 return 7; 163 164 if (Bytes <= 2730) 165 return 6; 166 167 if (Bytes <= 3276) 168 return 5; 169 170 if (Bytes <= 4096) 171 return 4; 172 173 if (Bytes <= 5461) 174 return 3; 175 176 if (Bytes <= 8192) 177 return 2; 178 179 return 1; 180 } 181 182 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 183 const Function &F) const { 184 185 // Default minimum/maximum flat work group sizes. 186 std::pair<unsigned, unsigned> Default = 187 AMDGPU::isCompute(F.getCallingConv()) ? 188 std::pair<unsigned, unsigned>(getWavefrontSize() * 2, 189 getWavefrontSize() * 4) : 190 std::pair<unsigned, unsigned>(1, getWavefrontSize()); 191 192 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 193 // starts using "amdgpu-flat-work-group-size" attribute. 194 Default.second = AMDGPU::getIntegerAttribute( 195 F, "amdgpu-max-work-group-size", Default.second); 196 Default.first = std::min(Default.first, Default.second); 197 198 // Requested minimum/maximum flat work group sizes. 199 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 200 F, "amdgpu-flat-work-group-size", Default); 201 202 // Make sure requested minimum is less than requested maximum. 203 if (Requested.first > Requested.second) 204 return Default; 205 206 // Make sure requested values do not violate subtarget's specifications. 207 if (Requested.first < getMinFlatWorkGroupSize()) 208 return Default; 209 if (Requested.second > getMaxFlatWorkGroupSize()) 210 return Default; 211 212 return Requested; 213 } 214 215 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 216 const Function &F) const { 217 218 // Default minimum/maximum number of waves per execution unit. 219 std::pair<unsigned, unsigned> Default(1, 0); 220 221 // Default/requested minimum/maximum flat work group sizes. 222 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 223 224 // If minimum/maximum flat work group sizes were explicitly requested using 225 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 226 // number of waves per execution unit to values implied by requested 227 // minimum/maximum flat work group sizes. 228 unsigned MinImpliedByFlatWorkGroupSize = 229 getMaxWavesPerEU(FlatWorkGroupSizes.second); 230 bool RequestedFlatWorkGroupSize = false; 231 232 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 233 // starts using "amdgpu-flat-work-group-size" attribute. 234 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 235 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 236 Default.first = MinImpliedByFlatWorkGroupSize; 237 RequestedFlatWorkGroupSize = true; 238 } 239 240 // Requested minimum/maximum number of waves per execution unit. 241 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 242 F, "amdgpu-waves-per-eu", Default, true); 243 244 // Make sure requested minimum is less than requested maximum. 245 if (Requested.second && Requested.first > Requested.second) 246 return Default; 247 248 // Make sure requested values do not violate subtarget's specifications. 249 if (Requested.first < getMinWavesPerEU() || 250 Requested.first > getMaxWavesPerEU()) 251 return Default; 252 if (Requested.second > getMaxWavesPerEU()) 253 return Default; 254 255 // Make sure requested values are compatible with values implied by requested 256 // minimum/maximum flat work group sizes. 257 if (RequestedFlatWorkGroupSize && 258 Requested.first > MinImpliedByFlatWorkGroupSize) 259 return Default; 260 261 return Requested; 262 } 263 264 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 265 const TargetMachine &TM) : 266 AMDGPUSubtarget(TT, GPU, FS, TM), 267 InstrInfo(*this), 268 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 269 TLInfo(TM, *this) {} 270 271 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, 272 const TargetMachine &TM) : 273 AMDGPUSubtarget(TT, GPU, FS, TM), 274 InstrInfo(*this), 275 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 276 TLInfo(TM, *this), 277 GISel() {} 278 279 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 280 unsigned NumRegionInstrs) const { 281 // Track register pressure so the scheduler can try to decrease 282 // pressure once register usage is above the threshold defined by 283 // SIRegisterInfo::getRegPressureSetLimit() 284 Policy.ShouldTrackPressure = true; 285 286 // Enabling both top down and bottom up scheduling seems to give us less 287 // register spills than just using one of these approaches on its own. 288 Policy.OnlyTopDown = false; 289 Policy.OnlyBottomUp = false; 290 291 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 292 if (!enableSIScheduler()) 293 Policy.ShouldTrackLaneMasks = true; 294 } 295 296 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { 297 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 298 } 299 300 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 301 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 302 if (SGPRs <= 80) 303 return 10; 304 if (SGPRs <= 88) 305 return 9; 306 if (SGPRs <= 100) 307 return 8; 308 return 7; 309 } 310 if (SGPRs <= 48) 311 return 10; 312 if (SGPRs <= 56) 313 return 9; 314 if (SGPRs <= 64) 315 return 8; 316 if (SGPRs <= 72) 317 return 7; 318 if (SGPRs <= 80) 319 return 6; 320 return 5; 321 } 322 323 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 324 if (VGPRs <= 24) 325 return 10; 326 if (VGPRs <= 28) 327 return 9; 328 if (VGPRs <= 32) 329 return 8; 330 if (VGPRs <= 36) 331 return 7; 332 if (VGPRs <= 40) 333 return 6; 334 if (VGPRs <= 48) 335 return 5; 336 if (VGPRs <= 64) 337 return 4; 338 if (VGPRs <= 84) 339 return 3; 340 if (VGPRs <= 128) 341 return 2; 342 return 1; 343 } 344