1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUSubtarget.h" 16 #include "R600ISelLowering.h" 17 #include "R600InstrInfo.h" 18 #include "SIFrameLowering.h" 19 #include "SIISelLowering.h" 20 #include "SIInstrInfo.h" 21 #include "SIMachineFunctionInfo.h" 22 #include "llvm/ADT/SmallString.h" 23 #include "llvm/CodeGen/MachineScheduler.h" 24 25 using namespace llvm; 26 27 #define DEBUG_TYPE "amdgpu-subtarget" 28 29 #define GET_SUBTARGETINFO_ENUM 30 #define GET_SUBTARGETINFO_TARGET_DESC 31 #define GET_SUBTARGETINFO_CTOR 32 #include "AMDGPUGenSubtargetInfo.inc" 33 34 AMDGPUSubtarget::~AMDGPUSubtarget() {} 35 36 AMDGPUSubtarget & 37 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, 38 StringRef GPU, StringRef FS) { 39 // Determine default and user-specified characteristics 40 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be 41 // enabled, but some instructions do not respect them and they run at the 42 // double precision rate, so don't enable by default. 43 // 44 // We want to be able to turn these off, but making this a subtarget feature 45 // for SI has the unhelpful behavior that it unsets everything else if you 46 // disable it. 47 48 SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,"); 49 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 50 FullFS += "+flat-for-global,+unaligned-buffer-access,"; 51 FullFS += FS; 52 53 ParseSubtargetFeatures(GPU, FullFS); 54 55 // FIXME: I don't think think Evergreen has any useful support for 56 // denormals, but should be checked. Should we issue a warning somewhere 57 // if someone tries to enable these? 58 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { 59 FP16Denormals = false; 60 FP32Denormals = false; 61 FP64Denormals = false; 62 } 63 64 // Set defaults if needed. 65 if (MaxPrivateElementSize == 0) 66 MaxPrivateElementSize = 4; 67 68 return *this; 69 } 70 71 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 72 const TargetMachine &TM) 73 : AMDGPUGenSubtargetInfo(TT, GPU, FS), 74 TargetTriple(TT), 75 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600), 76 IsaVersion(ISAVersion0_0_0), 77 WavefrontSize(64), 78 LocalMemorySize(0), 79 LDSBankCount(0), 80 MaxPrivateElementSize(0), 81 82 FastFMAF32(false), 83 HalfRate64Ops(false), 84 85 FP16Denormals(false), 86 FP32Denormals(false), 87 FP64Denormals(false), 88 FPExceptions(false), 89 FlatForGlobal(false), 90 UnalignedScratchAccess(false), 91 UnalignedBufferAccess(false), 92 93 EnableXNACK(false), 94 DebuggerInsertNops(false), 95 DebuggerReserveRegs(false), 96 DebuggerEmitPrologue(false), 97 98 EnableVGPRSpilling(false), 99 EnablePromoteAlloca(false), 100 EnableLoadStoreOpt(false), 101 EnableUnsafeDSOffsetFolding(false), 102 EnableSIScheduler(false), 103 DumpCode(false), 104 105 FP64(false), 106 IsGCN(false), 107 GCN1Encoding(false), 108 GCN3Encoding(false), 109 CIInsts(false), 110 SGPRInitBug(false), 111 HasSMemRealTime(false), 112 Has16BitInsts(false), 113 HasMovrel(false), 114 HasVGPRIndexMode(false), 115 HasScalarStores(false), 116 HasInv2PiInlineImm(false), 117 FlatAddressSpace(false), 118 119 R600ALUInst(false), 120 CaymanISA(false), 121 CFALUBug(false), 122 HasVertexCache(false), 123 TexVTXClauseSize(0), 124 ScalarizeGlobal(false), 125 126 FeatureDisable(false), 127 InstrItins(getInstrItineraryForCPU(GPU)), 128 TSInfo() { 129 initializeSubtargetDependencies(TT, GPU, FS); 130 } 131 132 // FIXME: These limits are for SI. Did they change with the larger maximum LDS 133 // size? 134 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { 135 switch (NWaves) { 136 case 10: 137 return 1638; 138 case 9: 139 return 1820; 140 case 8: 141 return 2048; 142 case 7: 143 return 2340; 144 case 6: 145 return 2730; 146 case 5: 147 return 3276; 148 case 4: 149 return 4096; 150 case 3: 151 return 5461; 152 case 2: 153 return 8192; 154 default: 155 return getLocalMemorySize(); 156 } 157 } 158 159 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { 160 if (Bytes <= 1638) 161 return 10; 162 163 if (Bytes <= 1820) 164 return 9; 165 166 if (Bytes <= 2048) 167 return 8; 168 169 if (Bytes <= 2340) 170 return 7; 171 172 if (Bytes <= 2730) 173 return 6; 174 175 if (Bytes <= 3276) 176 return 5; 177 178 if (Bytes <= 4096) 179 return 4; 180 181 if (Bytes <= 5461) 182 return 3; 183 184 if (Bytes <= 8192) 185 return 2; 186 187 return 1; 188 } 189 190 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 191 const Function &F) const { 192 193 // Default minimum/maximum flat work group sizes. 194 std::pair<unsigned, unsigned> Default = 195 AMDGPU::isCompute(F.getCallingConv()) ? 196 std::pair<unsigned, unsigned>(getWavefrontSize() * 2, 197 getWavefrontSize() * 4) : 198 std::pair<unsigned, unsigned>(1, getWavefrontSize()); 199 200 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 201 // starts using "amdgpu-flat-work-group-size" attribute. 202 Default.second = AMDGPU::getIntegerAttribute( 203 F, "amdgpu-max-work-group-size", Default.second); 204 Default.first = std::min(Default.first, Default.second); 205 206 // Requested minimum/maximum flat work group sizes. 207 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 208 F, "amdgpu-flat-work-group-size", Default); 209 210 // Make sure requested minimum is less than requested maximum. 211 if (Requested.first > Requested.second) 212 return Default; 213 214 // Make sure requested values do not violate subtarget's specifications. 215 if (Requested.first < getMinFlatWorkGroupSize()) 216 return Default; 217 if (Requested.second > getMaxFlatWorkGroupSize()) 218 return Default; 219 220 return Requested; 221 } 222 223 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 224 const Function &F) const { 225 226 // Default minimum/maximum number of waves per execution unit. 227 std::pair<unsigned, unsigned> Default(1, 0); 228 229 // Default/requested minimum/maximum flat work group sizes. 230 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 231 232 // If minimum/maximum flat work group sizes were explicitly requested using 233 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 234 // number of waves per execution unit to values implied by requested 235 // minimum/maximum flat work group sizes. 236 unsigned MinImpliedByFlatWorkGroupSize = 237 getMaxWavesPerEU(FlatWorkGroupSizes.second); 238 bool RequestedFlatWorkGroupSize = false; 239 240 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 241 // starts using "amdgpu-flat-work-group-size" attribute. 242 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 243 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 244 Default.first = MinImpliedByFlatWorkGroupSize; 245 RequestedFlatWorkGroupSize = true; 246 } 247 248 // Requested minimum/maximum number of waves per execution unit. 249 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 250 F, "amdgpu-waves-per-eu", Default, true); 251 252 // Make sure requested minimum is less than requested maximum. 253 if (Requested.second && Requested.first > Requested.second) 254 return Default; 255 256 // Make sure requested values do not violate subtarget's specifications. 257 if (Requested.first < getMinWavesPerEU() || 258 Requested.first > getMaxWavesPerEU()) 259 return Default; 260 if (Requested.second > getMaxWavesPerEU()) 261 return Default; 262 263 // Make sure requested values are compatible with values implied by requested 264 // minimum/maximum flat work group sizes. 265 if (RequestedFlatWorkGroupSize && 266 Requested.first > MinImpliedByFlatWorkGroupSize) 267 return Default; 268 269 return Requested; 270 } 271 272 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 273 const TargetMachine &TM) : 274 AMDGPUSubtarget(TT, GPU, FS, TM), 275 InstrInfo(*this), 276 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 277 TLInfo(TM, *this) {} 278 279 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, 280 const TargetMachine &TM) : 281 AMDGPUSubtarget(TT, GPU, FS, TM), 282 InstrInfo(*this), 283 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 284 TLInfo(TM, *this), 285 GISel() {} 286 287 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 288 unsigned NumRegionInstrs) const { 289 // Track register pressure so the scheduler can try to decrease 290 // pressure once register usage is above the threshold defined by 291 // SIRegisterInfo::getRegPressureSetLimit() 292 Policy.ShouldTrackPressure = true; 293 294 // Enabling both top down and bottom up scheduling seems to give us less 295 // register spills than just using one of these approaches on its own. 296 Policy.OnlyTopDown = false; 297 Policy.OnlyBottomUp = false; 298 299 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 300 if (!enableSIScheduler()) 301 Policy.ShouldTrackLaneMasks = true; 302 } 303 304 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { 305 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 306 } 307 308 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const { 309 unsigned ImplicitBytes = getImplicitArgNumBytes(); 310 if (ImplicitBytes == 0) 311 return ExplicitArgBytes; 312 313 unsigned Alignment = getAlignmentForImplicitArgPtr(); 314 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 315 } 316 317 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 318 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 319 if (SGPRs <= 80) 320 return 10; 321 if (SGPRs <= 88) 322 return 9; 323 if (SGPRs <= 100) 324 return 8; 325 return 7; 326 } 327 if (SGPRs <= 48) 328 return 10; 329 if (SGPRs <= 56) 330 return 9; 331 if (SGPRs <= 64) 332 return 8; 333 if (SGPRs <= 72) 334 return 7; 335 if (SGPRs <= 80) 336 return 6; 337 return 5; 338 } 339 340 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 341 if (VGPRs <= 24) 342 return 10; 343 if (VGPRs <= 28) 344 return 9; 345 if (VGPRs <= 32) 346 return 8; 347 if (VGPRs <= 36) 348 return 7; 349 if (VGPRs <= 40) 350 return 6; 351 if (VGPRs <= 48) 352 return 5; 353 if (VGPRs <= 64) 354 return 4; 355 if (VGPRs <= 84) 356 return 3; 357 if (VGPRs <= 128) 358 return 2; 359 return 1; 360 } 361 362 unsigned SISubtarget::getMaxNumSGPRs() const { 363 if (hasSGPRInitBug()) 364 return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; 365 366 if (getGeneration() >= VOLCANIC_ISLANDS) 367 return 102; 368 369 return 104; 370 } 371