//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#endif
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

/// \brief Build the effective feature string (baseline defaults + OS-specific
/// defaults + user features, in that order so user features win), parse it,
/// and then apply generation-dependent fixups. Returns *this so it can be
/// chained inside a constructor initializer list.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // Append the user feature string last so explicit user settings override the
  // defaults prepended above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  return *this;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {

// Concrete GISelAccessor that owns the four GlobalISel components and hands
// out non-owning pointers to them. Only built when GlobalISel is enabled.
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<RegisterBankInfo> RegBankInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
  const InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }
  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }
  const RegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }
};

} // end anonymous namespace
#endif

/// \brief Base subtarget constructor. Every feature flag starts false/zero
/// here; the real values are filled in by initializeSubtargetDependencies(),
/// which runs ParseSubtargetFeatures() at the end of this constructor.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples are GCN (SI at minimum); everything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(64),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN1Encoding(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasVOP3PInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

/// \brief Return the amount of LDS a workgroup of function \p F may use while
/// still allowing \p NWaves waves to be resident. With a single wave the whole
/// LDS is available; otherwise the budget is scaled by the achievable
/// waves/workgroups per CU.
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  // Use the maximum flat work group size (.second) as the workgroup size.
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

/// \brief Inverse of getMaxLocalMemSizeWithWaveCount(): the number of waves
/// (clamped to [1, MaxWavesPerEU]) that remain achievable when each workgroup
/// of \p F uses \p Bytes bytes of LDS.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  // Guard against Bytes == 0 to avoid dividing by zero.
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

/// \brief Compute the (min, max) flat work group size for \p F from function
/// attributes, falling back to calling-convention-based defaults. Any
/// requested range that is inconsistent or violates the subtarget limits is
/// rejected in favor of the defaults.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    AMDGPU::isCompute(F.getCallingConv()) ?
      std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
                                    getWavefrontSize() * 4) :
      std::pair<unsigned, unsigned>(1, getWavefrontSize());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the pair ordered in case the attribute lowered the maximum below the
  // default minimum.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

/// \brief Compute the (min, max) waves-per-EU range for \p F. The minimum
/// defaults to what the requested flat work group size implies; an explicit
/// "amdgpu-waves-per-eu" attribute is validated against the subtarget limits
/// and against the flat-work-group-size implication before being accepted.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum (a maximum
  // of 0 means "unspecified").
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

/// \brief Attach !range metadata to a workitem-id or local-size query \p I,
/// bounding it by the kernel's flat work group size (narrowed further by
/// reqd_work_group_size metadata when present). Returns true if metadata was
/// added.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // The workitem-id intrinsics fall through to the matching local-size
      // case to pick up the dimension index.
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No usable bound (e.g. flat work group size resolved to 0).
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
  : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
    TLInfo(TM, *this) {
#ifndef LLVM_BUILD_GLOBAL_ISEL
  // Without GlobalISel, install the default (all-null) accessor.
  GISelAccessor *GISel = new GISelAccessor();
#else
  SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
  GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  GISel->Legalizer.reset(new AMDGPULegalizerInfo());

  GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  GISel->InstSelector.reset(new AMDGPUInstructionSelector(
    *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
#endif
  setGISelAccessor(*GISel);
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

/// \brief VGPR spilling is always allowed for non-shader (compute) calling
/// conventions; shaders require the explicit subtarget feature.
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

/// \brief Total kernarg segment size: the explicit arguments plus, when the
/// target has implicit kernel arguments, the implicit block appended at its
/// required alignment.
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

/// \brief Map an SGPR count to achievable occupancy (waves/EU). The threshold
/// tables differ between VI+ and earlier generations.
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

/// \brief Map a VGPR count to achievable occupancy (waves/EU).
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

/// \brief Number of SGPRs reserved for special registers (FLAT_SCRATCH,
/// XNACK, VCC), depending on generation and what the function uses.
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

/// \brief Maximum number of SGPRs the function may allocate, honoring the
/// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, preloaded input
/// SGPRs, the SGPR init hardware bug, and the reserved special registers.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 means "ignore the attribute" from here on.)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

/// \brief Maximum number of VGPRs the function may allocate, honoring the
/// "amdgpu-num-vgpr" attribute and the waves-per-EU constraints, minus the
/// reserved VGPRs.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 means "ignore the attribute" from here on.)
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}