//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

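// Resolve the callee operand of a call pseudo to the underlying Function.
// A null callee is encoded as an immediate 0; a GlobalAlias is looked through
// to its aliasee.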
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

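// Return true if Reg has any use that is not an implicit operand of a FLAT
// instruction, i.e. a use that means the register is genuinely needed rather
// than only implicitly referenced by flat memory instructions.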
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

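// Analyze every defined function in the module. If any function contains an
// indirect call, widen the usage of those functions afterwards to cover all
// potential callees.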
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(F);
    assert(MF && "function must have been generated already");

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

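// Compute the resource usage of a single function: register counts, private
// segment (scratch) size, and VCC / flat-scratch usage, folding in the
// already-computed usage of any direct callees.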
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

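  // With calls present, walk every operand to find the highest register index
  // touched in each register file, and accumulate the largest callee frame.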
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

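        // Classify the operand's register file (SGPR, VGPR, or AGPR) and
        // record its width in units of 32-bit registers.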
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
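        // The operand covers hardware registers [HWReg, HWReg + Width - 1];
        // track the highest index seen for each register file.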
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

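// Widen the register usage of every function containing an indirect call to
// the maximum usage of all functions such a call could reach, i.e. all
// non-entrypoint functions in the module.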
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}