//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

// Pull in the TableGen-generated feature tables and the generated parts of
// the AMDGPUGenSubtargetInfo constructor / ParseSubtargetFeatures.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

/// Assemble the full feature string (built-in defaults, OS-dependent
/// defaults, then the user-supplied \p FS), parse it, and apply fixups that
/// cannot be expressed as plain subtarget features. Returns *this so it can
/// be used to initialize dependent members from a constructor init list.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults are listed first so that the user feature string appended below
  // can override any of them.
  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  // Note: the check is against the raw user FS, not FullFS, so the HSA
  // default added above does not count as an explicit request.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

/// Construct the subtarget. The init list pre-initializes every feature bit
/// to its neutral/false value (the order must match the member declaration
/// order in AMDGPUSubtarget.h); the real values are then filled in by
/// initializeSubtargetDependencies() at the end of the body.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // An amdgcn triple is GCN hardware (SI at minimum); anything else R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

/// \returns the maximum amount of LDS (in bytes) a workgroup may use while
/// still allowing \p NWaves waves (of the maximum flat workgroup size for
/// \p F) to be resident. A single wave may use all of LDS.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  // Scale total LDS by the wave-occupancy headroom and divide among waves.
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

/// \returns the wave occupancy (clamped to [1, getMaxWavesPerEU()]) that is
/// achievable when each workgroup of the maximum flat size for \p F uses
/// \p Bytes of LDS.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // Guard against division by zero when no LDS is used.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

/// \returns the {minimum, maximum} flat workgroup size for \p F: subtarget
/// defaults, optionally overridden by the "amdgpu-flat-work-group-size"
/// function attribute. Requested values that are inconsistent or outside the
/// subtarget's limits fall back to the defaults.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the default pair consistent if the legacy attribute lowered the max.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// \returns the {minimum, maximum} number of waves per execution unit for
/// \p F, honoring the "amdgpu-waves-per-eu" attribute and the constraints
/// implied by the (possibly attribute-requested) flat workgroup sizes.
/// Invalid requests fall back to the defaults.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (Requested.second == 0 means no maximum was specified.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

/// Attach !range metadata to the workitem-id / local-size intrinsic call
/// \p I, bounding it by the kernel's maximum flat workgroup size (narrowed
/// to an exact value when reqd_work_group_size metadata is present).
/// \returns true if range metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        // Intentional fall-through: ID and size queries share a dimension.
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        // Intentional fall-through.
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        // Intentional fall-through.
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim is 0..2 when matched above, UINT_MAX otherwise.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No usable bound (e.g. attribute parsing produced 0); don't annotate.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

/// R600 (pre-GCN) subtarget: stack grows up for its frame lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

/// GCN (SI and later) subtarget.
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

/// Tune the generic MachineScheduler for SI register-pressure behavior.
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

/// VGPR spilling is always allowed for non-shader (compute) functions;
/// shaders require the EnableVGPRSpilling subtarget flag.
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

/// \returns the kernarg segment size: the explicit argument bytes plus, if
/// the OS ABI defines implicit arguments, padding to their alignment and the
/// implicit bytes themselves.
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

/// \returns the wave occupancy permitted by using \p SGPRs scalar registers.
/// NOTE(review): thresholds follow the per-generation SGPR allocation
/// granularity; confirm against the relevant GCN ISA documentation before
/// changing.
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// \returns the wave occupancy permitted by using \p VGPRs vector registers.
/// NOTE(review): thresholds follow the VGPR allocation granularity; confirm
/// against the ISA documentation before changing.
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

/// \returns the number of SGPRs reserved for special registers (VCC, and
/// depending on generation/features, FLAT_SCRATCH and XNACK), which are
/// unavailable for general allocation in \p MF.
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

/// \returns the maximum number of SGPRs available to \p MF, derived from the
/// waves-per-EU requirement and optionally narrowed by the "amdgpu-num-sgpr"
/// attribute. Invalid attribute requests are ignored (treated as 0).
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware workaround: affected chips must allocate a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

/// \returns the maximum number of VGPRs available to \p MF, derived from the
/// waves-per-EU requirement and optionally narrowed by the "amdgpu-num-vgpr"
/// attribute. Invalid attribute requests are ignored (treated as 0).
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}