//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(F);
    assert(MF && "function must have been generated already");

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
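      // Record the highest hardware register index each operand touches (a
      // tuple register covers HWReg .. HWReg + Width - 1); the register counts
      // below are derived from these maxima plus one.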
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}