//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and run at the double
  // precision rate instead, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing
  // ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}
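// Illustrative note on feature-string precedence (the invocation below is a
// hypothetical example, not something this file runs): because the user
// string FS is appended after the defaults in FullFS, a later occurrence of
// a feature wins when ParseSubtargetFeatures applies them in order. For
// instance,
//
//   llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca
//
// effectively parses "+promote-alloca,...,-promote-alloca", and the trailing
// "-promote-alloca" disables the default.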
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

// Return the maximum amount of LDS a single work group can use while still
// sustaining an occupancy of NWaves waves.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// Return the achievable occupancy, in waves per EU, when each work group
// uses Bytes bytes of LDS.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
                                                 const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
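// Worked example with hypothetical numbers (not read from a real subtarget):
// with LocalMemorySize = 65536, MaxWaves = 10, and a work group size that
// gives WorkGroupsPerCu = 5, Limit = 65536 * 10 / 5 = 131072. A kernel using
// Bytes = 32768 of LDS then gets NumWaves = 131072 / 32768 = 4, which already
// lies in [1, 10], so the reported occupancy is 4 waves per EU.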
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first > MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
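// Illustrative IR (hand-written example, not emitted by this file): the
// attributes consumed above appear on kernels as, e.g.,
//
//   define amdgpu_kernel void @k(...) #0 { ... }
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256"
//                     "amdgpu-waves-per-eu"="2,8" }
//
// With a 64-wide wavefront, a maximum flat work group size of 256 corresponds
// to 4 waves per work group; getMaxWavesPerEU(256) turns that into the
// implied waves-per-EU default above (the exact mapping depends on the
// subtarget).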
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 2) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the result lies in
  // [0, MaxSize), so MaxSize itself is the exclusive upper bound. For a size
  // query the result can equal MaxSize, so pass MaxSize + 1 as Hi.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
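// Illustrative result (hand-written IR): for a kernel carrying
// !reqd_work_group_size of {64, 1, 1}, a work-item ID query in dimension x
// is annotated as
//
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
//   !0 = !{i32 0, i32 64}
//
// while a local-size query in the same dimension gets !{i32 64, i32 65},
// since the size itself (64) must be a representable value.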
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using either approach on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler, so only
  // enable it when the default scheduler is in use.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}
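// Illustrative pairing (hypothetical; callers choose how to combine these):
// on a VI-class subtarget, a function using 96 SGPRs and 40 VGPRs yields
// getOccupancyWithNumSGPRs(96) == 8 and getOccupancyWithNumVGPRs(40) == 6,
// so taking the minimum gives an achievable occupancy of 6 waves per EU.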
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications; it must at least cover the reserved registers.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications; it must at least cover the reserved registers.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
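// Illustrative IR (hand-written example): the register-budget attributes read
// above look like
//
//   attributes #0 = { "amdgpu-num-sgpr"="48" "amdgpu-num-vgpr"="32" }
//
// A request that does not cover the reserved registers, falls below the
// preloaded input SGPRs, or is incompatible with the waves-per-EU bounds is
// raised or dropped by the checks above before it replaces the default.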