//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>
#include <climits>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless flat-for-global is specified explicitly (as +/-flat-for-global),
  // turn on FlatForGlobal for all OSes on VI and newer hardware to avoid
  // assertion failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
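// For example (illustrative, not exhaustive): on an amdhsa target the default
// feature string assembled in initializeSubtargetDependencies above is
//   "+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"
//   "+flat-for-global,+unaligned-buffer-access,+trap-handler,"
// with the user-provided FS appended last, so explicit user features are
// parsed after the defaults and can override them.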
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
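// Worked example for getOccupancyWithLocalMemSize above (all numbers are
// illustrative): with 64 KB of LDS, a maximum of 10 waves per EU, and, say,
// 5 work groups per CU, Limit = 65536 * 10 / 5 = 131072 bytes. A kernel
// using 32 KB of LDS then gets NumWaves = 131072 / 32768 = 4, which already
// lies inside the clamped [1, 10] range.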
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
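// Illustrative IR for the attributes consumed above (the values are examples,
// not defaults):
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256"
//                     "amdgpu-waves-per-eu"="2,8" }
// Both are "min,max" integer pairs; requests that fall outside the
// subtarget's limits are ignored and the computed defaults are used instead.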
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we pass the max size as Hi
  // (IDs lie in [0, max)). For a size query we pass max size + 1 as Hi so
  // that the max itself is included in the range.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}
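// Illustrative result of makeLIDRangeMetadata above: for a kernel carrying
//   !reqd_work_group_size !{i32 64, i32 1, i32 1}
// a workitem-ID query in dimension x gets !range !{i32 0, i32 64} (IDs lie
// in [0, 64)), while a local-size query in x gets !range !{i32 64, i32 65},
// i.e. the constant 64.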
bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
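// Illustrative reading of the occupancy tables above: on VI
// (VOLCANIC_ISLANDS and newer) a kernel using 96 SGPRs can run at most
// 8 waves per EU, and one using 40 VGPRs at most 6 waves per EU, so the
// combined occupancy bound for such a kernel would be min(8, 6) = 6.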
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
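// Illustrative use of the attribute overrides handled above (the values are
// examples):
//   attributes #0 = { "amdgpu-num-sgpr"="48" "amdgpu-num-vgpr"="64" }
// Requests that fall at or below the reserved-register count, or that
// conflict with the waves-per-eu bounds, are dropped (Requested is reset to
// 0) and the computed defaults are kept.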