1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUSubtarget.h" 16 #include "R600ISelLowering.h" 17 #include "R600InstrInfo.h" 18 #include "SIFrameLowering.h" 19 #include "SIISelLowering.h" 20 #include "SIInstrInfo.h" 21 #include "SIMachineFunctionInfo.h" 22 #include "llvm/ADT/SmallString.h" 23 #include "llvm/CodeGen/MachineScheduler.h" 24 25 using namespace llvm; 26 27 #define DEBUG_TYPE "amdgpu-subtarget" 28 29 #define GET_SUBTARGETINFO_ENUM 30 #define GET_SUBTARGETINFO_TARGET_DESC 31 #define GET_SUBTARGETINFO_CTOR 32 #include "AMDGPUGenSubtargetInfo.inc" 33 34 AMDGPUSubtarget::~AMDGPUSubtarget() {} 35 36 AMDGPUSubtarget & 37 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, 38 StringRef GPU, StringRef FS) { 39 // Determine default and user-specified characteristics 40 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be 41 // enabled, but some instructions do not respect them and they run at the 42 // double precision rate, so don't enable by default. 43 // 44 // We want to be able to turn these off, but making this a subtarget feature 45 // for SI has the unhelpful behavior that it unsets everything else if you 46 // disable it. 47 48 SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,"); 49 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 50 FullFS += "+flat-for-global,+unaligned-buffer-access,"; 51 FullFS += FS; 52 53 ParseSubtargetFeatures(GPU, FullFS); 54 55 // FIXME: I don't think think Evergreen has any useful support for 56 // denormals, but should be checked. Should we issue a warning somewhere 57 // if someone tries to enable these? 58 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { 59 FP32Denormals = false; 60 FP64Denormals = false; 61 } 62 63 // Set defaults if needed. 64 if (MaxPrivateElementSize == 0) 65 MaxPrivateElementSize = 4; 66 67 return *this; 68 } 69 70 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 71 const TargetMachine &TM) 72 : AMDGPUGenSubtargetInfo(TT, GPU, FS), 73 TargetTriple(TT), 74 Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600), 75 IsaVersion(ISAVersion0_0_0), 76 WavefrontSize(64), 77 LocalMemorySize(0), 78 LDSBankCount(0), 79 MaxPrivateElementSize(0), 80 81 FastFMAF32(false), 82 HalfRate64Ops(false), 83 84 FP32Denormals(false), 85 FP64Denormals(false), 86 FPExceptions(false), 87 FlatForGlobal(false), 88 UnalignedScratchAccess(false), 89 UnalignedBufferAccess(false), 90 91 EnableXNACK(false), 92 DebuggerInsertNops(false), 93 DebuggerReserveRegs(false), 94 DebuggerEmitPrologue(false), 95 96 EnableVGPRSpilling(false), 97 EnablePromoteAlloca(false), 98 EnableLoadStoreOpt(false), 99 EnableUnsafeDSOffsetFolding(false), 100 EnableSIScheduler(false), 101 DumpCode(false), 102 103 FP64(false), 104 IsGCN(false), 105 GCN1Encoding(false), 106 GCN3Encoding(false), 107 CIInsts(false), 108 SGPRInitBug(false), 109 HasSMemRealTime(false), 110 Has16BitInsts(false), 111 HasMovrel(false), 112 HasVGPRIndexMode(false), 113 FlatAddressSpace(false), 114 115 R600ALUInst(false), 116 CaymanISA(false), 117 CFALUBug(false), 118 HasVertexCache(false), 119 TexVTXClauseSize(0), 120 121 FeatureDisable(false), 122 InstrItins(getInstrItineraryForCPU(GPU)), 123 TSInfo() { 124 initializeSubtargetDependencies(TT, GPU, FS); 125 } 126 127 // FIXME: These limits are for SI. Did they change with the larger maximum LDS 128 // size? 129 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { 130 switch (NWaves) { 131 case 10: 132 return 1638; 133 case 9: 134 return 1820; 135 case 8: 136 return 2048; 137 case 7: 138 return 2340; 139 case 6: 140 return 2730; 141 case 5: 142 return 3276; 143 case 4: 144 return 4096; 145 case 3: 146 return 5461; 147 case 2: 148 return 8192; 149 default: 150 return getLocalMemorySize(); 151 } 152 } 153 154 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { 155 if (Bytes <= 1638) 156 return 10; 157 158 if (Bytes <= 1820) 159 return 9; 160 161 if (Bytes <= 2048) 162 return 8; 163 164 if (Bytes <= 2340) 165 return 7; 166 167 if (Bytes <= 2730) 168 return 6; 169 170 if (Bytes <= 3276) 171 return 5; 172 173 if (Bytes <= 4096) 174 return 4; 175 176 if (Bytes <= 5461) 177 return 3; 178 179 if (Bytes <= 8192) 180 return 2; 181 182 return 1; 183 } 184 185 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( 186 const Function &F) const { 187 188 // Default minimum/maximum flat work group sizes. 189 std::pair<unsigned, unsigned> Default = 190 AMDGPU::isCompute(F.getCallingConv()) ? 191 std::pair<unsigned, unsigned>(getWavefrontSize() * 2, 192 getWavefrontSize() * 4) : 193 std::pair<unsigned, unsigned>(1, getWavefrontSize()); 194 195 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 196 // starts using "amdgpu-flat-work-group-size" attribute. 197 Default.second = AMDGPU::getIntegerAttribute( 198 F, "amdgpu-max-work-group-size", Default.second); 199 Default.first = std::min(Default.first, Default.second); 200 201 // Requested minimum/maximum flat work group sizes. 202 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 203 F, "amdgpu-flat-work-group-size", Default); 204 205 // Make sure requested minimum is less than requested maximum. 206 if (Requested.first > Requested.second) 207 return Default; 208 209 // Make sure requested values do not violate subtarget's specifications. 210 if (Requested.first < getMinFlatWorkGroupSize()) 211 return Default; 212 if (Requested.second > getMaxFlatWorkGroupSize()) 213 return Default; 214 215 return Requested; 216 } 217 218 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( 219 const Function &F) const { 220 221 // Default minimum/maximum number of waves per execution unit. 222 std::pair<unsigned, unsigned> Default(1, 0); 223 224 // Default/requested minimum/maximum flat work group sizes. 225 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F); 226 227 // If minimum/maximum flat work group sizes were explicitly requested using 228 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 229 // number of waves per execution unit to values implied by requested 230 // minimum/maximum flat work group sizes. 231 unsigned MinImpliedByFlatWorkGroupSize = 232 getMaxWavesPerEU(FlatWorkGroupSizes.second); 233 bool RequestedFlatWorkGroupSize = false; 234 235 // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa 236 // starts using "amdgpu-flat-work-group-size" attribute. 237 if (F.hasFnAttribute("amdgpu-max-work-group-size") || 238 F.hasFnAttribute("amdgpu-flat-work-group-size")) { 239 Default.first = MinImpliedByFlatWorkGroupSize; 240 RequestedFlatWorkGroupSize = true; 241 } 242 243 // Requested minimum/maximum number of waves per execution unit. 244 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( 245 F, "amdgpu-waves-per-eu", Default, true); 246 247 // Make sure requested minimum is less than requested maximum. 248 if (Requested.second && Requested.first > Requested.second) 249 return Default; 250 251 // Make sure requested values do not violate subtarget's specifications. 252 if (Requested.first < getMinWavesPerEU() || 253 Requested.first > getMaxWavesPerEU()) 254 return Default; 255 if (Requested.second > getMaxWavesPerEU()) 256 return Default; 257 258 // Make sure requested values are compatible with values implied by requested 259 // minimum/maximum flat work group sizes. 260 if (RequestedFlatWorkGroupSize && 261 Requested.first > MinImpliedByFlatWorkGroupSize) 262 return Default; 263 264 return Requested; 265 } 266 267 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, 268 const TargetMachine &TM) : 269 AMDGPUSubtarget(TT, GPU, FS, TM), 270 InstrInfo(*this), 271 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 272 TLInfo(TM, *this) {} 273 274 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, 275 const TargetMachine &TM) : 276 AMDGPUSubtarget(TT, GPU, FS, TM), 277 InstrInfo(*this), 278 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), 279 TLInfo(TM, *this), 280 GISel() {} 281 282 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 283 unsigned NumRegionInstrs) const { 284 // Track register pressure so the scheduler can try to decrease 285 // pressure once register usage is above the threshold defined by 286 // SIRegisterInfo::getRegPressureSetLimit() 287 Policy.ShouldTrackPressure = true; 288 289 // Enabling both top down and bottom up scheduling seems to give us less 290 // register spills than just using one of these approaches on its own. 291 Policy.OnlyTopDown = false; 292 Policy.OnlyBottomUp = false; 293 294 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. 295 if (!enableSIScheduler()) 296 Policy.ShouldTrackLaneMasks = true; 297 } 298 299 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { 300 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); 301 } 302 303 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const { 304 unsigned ImplicitBytes = getImplicitArgNumBytes(); 305 if (ImplicitBytes == 0) 306 return ExplicitArgBytes; 307 308 unsigned Alignment = getAlignmentForImplicitArgPtr(); 309 return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; 310 } 311 312 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { 313 if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { 314 if (SGPRs <= 80) 315 return 10; 316 if (SGPRs <= 88) 317 return 9; 318 if (SGPRs <= 100) 319 return 8; 320 return 7; 321 } 322 if (SGPRs <= 48) 323 return 10; 324 if (SGPRs <= 56) 325 return 9; 326 if (SGPRs <= 64) 327 return 8; 328 if (SGPRs <= 72) 329 return 7; 330 if (SGPRs <= 80) 331 return 6; 332 return 5; 333 } 334 335 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { 336 if (VGPRs <= 24) 337 return 10; 338 if (VGPRs <= 28) 339 return 9; 340 if (VGPRs <= 32) 341 return 8; 342 if (VGPRs <= 36) 343 return 7; 344 if (VGPRs <= 40) 345 return 6; 346 if (VGPRs <= 48) 347 return 5; 348 if (VGPRs <= 64) 349 return 4; 350 if (VGPRs <= 84) 351 return 3; 352 if (VGPRs <= 128) 353 return 2; 354 return 1; 355 } 356