//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

// Out-of-line so the vtable is emitted in this translation unit.
AMDGPUSubtarget::~AMDGPUSubtarget() {}

/// Parse the subtarget feature string and apply target-specific fixups.
///
/// Builds a feature string from hard defaults, OS-conditional defaults, and
/// the user string \p FS (appended last so user settings win), then clamps
/// settings that are invalid for the detected generation. Returns *this so
/// it can be chained from the constructor's member initializers.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,";
  // User-specified features come last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    // Pre-SI generations: force all denormal modes off regardless of the
    // feature string.
    FP16Denormals = false;
    FP32Denormals = false;
    FP64Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

/// Construct the subtarget with conservative defaults, then let
/// initializeSubtargetDependencies() overwrite them from the feature string.
///
/// NOTE: the member-initializer order below must match the declaration order
/// in AMDGPUSubtarget.h; keep them in sync when adding features.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SI+); everything else is the older R600 family.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP16Denormals(false),
    FP32Denormals(false),
    FP64Denormals(false),
    FPExceptions(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    EnableXNACK(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),

    FeatureDisable(false),

    InstrItins(getInstrItineraryForCPU(GPU)),
    TSInfo() {
  initializeSubtargetDependencies(TT, GPU, FS);
}

// FIXME: These limits are for SI. Did they change with the larger maximum LDS
// size?
/// Return the maximum number of bytes of LDS available to a workgroup while
/// still sustaining \p NWaves waves.
///
/// Each table entry equals floor(16384 / NWaves); for NWaves outside [2,10]
/// the full local memory size is returned (i.e. no sharing constraint).
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
  switch (NWaves) {
  case 10:
    return 1638;
  case 9:
    return 1820;
  case 8:
    return 2048;
  case 7:
    return 2340;
  case 6:
    return 2730;
  case 5:
    return 3276;
  case 4:
    return 4096;
  case 3:
    return 5461;
  case 2:
    return 8192;
  default:
    return getLocalMemorySize();
  }
}

/// Inverse of getMaxLocalMemSizeWithWaveCount(): given \p Bytes of LDS used
/// by a workgroup, return the number of waves that can still be resident
/// (clamped to the range [1, 10]; thresholds mirror the table above).
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
  if (Bytes <= 1638)
    return 10;

  if (Bytes <= 1820)
    return 9;

  if (Bytes <= 2048)
    return 8;

  if (Bytes <= 2340)
    return 7;

  if (Bytes <= 2730)
    return 6;

  if (Bytes <= 3276)
    return 5;

  if (Bytes <= 4096)
    return 4;

  if (Bytes <= 5461)
    return 3;

  if (Bytes <= 8192)
    return 2;

  return 1;
}

/// Return the (min, max) flat work group sizes for function \p F.
///
/// Starts from calling-convention-based defaults, narrows the maximum via the
/// legacy "amdgpu-max-work-group-size" attribute, then applies the
/// "amdgpu-flat-work-group-size" attribute pair. Falls back to the defaults
/// whenever the requested pair is inconsistent (min > max) or violates the
/// subtarget's hard limits.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {

  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the default pair self-consistent after narrowing the maximum.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// Return the (min, max) number of waves per execution unit for function
/// \p F, honoring the "amdgpu-waves-per-eu" attribute.
///
/// A maximum of 0 in the returned/default pair means "no explicit maximum
/// requested" (see the `Requested.second &&` guard below). Any requested pair
/// that is inconsistent, violates subtarget limits, or conflicts with the
/// requested flat work group sizes yields the default instead.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {

  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, 0);

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this),
  GISel() {}

/// Tune the machine scheduler for GCN: track register pressure and schedule
/// in both directions to reduce spilling.
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

/// VGPR spilling is always allowed for non-shader (compute) functions; for
/// shaders it requires the EnableVGPRSpilling flag.
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

/// Return the total kernarg segment size in bytes: the explicit arguments,
/// padded to the implicit-argument pointer alignment, plus the implicit
/// argument bytes (if the target has any).
unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes();
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

/// Return the wave occupancy (waves per execution unit) achievable with
/// \p SGPRs scalar registers per wave. VI+ has different thresholds (and a
/// floor of 7) than SI/CI (floor of 5).
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// Return the wave occupancy achievable with \p VGPRs vector registers per
/// wave (clamped to the range [1, 10]).
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

/// Return the maximum number of addressable SGPRs: reduced on parts with the
/// SGPR-init hardware bug, and 102 on VI+ versus 104 on earlier generations.
unsigned SISubtarget::getMaxNumSGPRs() const {
  if (hasSGPRInitBug())
    return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

  if (getGeneration() >= VOLCANIC_ISLANDS)
    return 102;

  return 104;
}