//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() {}

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
    FP64Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ?
        SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)),
    TSInfo() {
  initializeSubtargetDependencies(TT, GPU, FS);
}

// FIXME: These limits are for SI. Did they change with the larger maximum LDS
// size?
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
  switch (NWaves) {
  case 10:
    return 1638;
  case 9:
    return 1820;
  case 8:
    return 2048;
  case 7:
    return 2340;
  case 6:
    return 2730;
  case 5:
    return 3276;
  case 4:
    return 4096;
  case 3:
    return 5461;
  case 2:
    return 8192;
  default:
    return getLocalMemorySize();
  }
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
  if (Bytes <= 1638)
    return 10;

  if (Bytes <= 1820)
    return 9;

  if (Bytes <= 2048)
    return 8;

  if (Bytes <= 2340)
    return 7;

  if (Bytes <= 2730)
    return 6;

  if (Bytes <= 3276)
    return 5;

  if (Bytes <= 4096)
    return 4;

  if (Bytes <= 5461)
    return 3;

  if (Bytes <= 8192)
    return 2;

  return 1;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {

  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
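  // The valid range is [getMinFlatWorkGroupSize(), getMaxFlatWorkGroupSize()].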
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {

  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this),
  GISel() {}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using just one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
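  // Only track lane masks when the default machine scheduler is in use.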
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes();
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}