//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// scratch usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call may target any function except
/// hardware entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum over all non-entrypoint
/// functions in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// If we don't know the true stack size, we have to report some amount to the
// runtime ahead of time. Assume a smaller number if the uncertainty is only
// due to dynamic / non-entry-block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

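// Return the Function a call's callee operand refers to, looking through a
// GlobalAlias if necessary. An immediate zero operand means the callee is
// unknown.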
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

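// Return true if Reg has any use other than as an implicit operand of a FLAT
// instruction.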
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

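// Total SGPR count, including the extra SGPRs reserved for VCC, flat scratch
// and XNACK.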
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

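// Total VGPR count. With gfx90a's unified register file, AGPRs are allocated
// after the VGPRs with the AGPR block aligned to 4 registers, e.g. 10 VGPRs
// and 4 AGPRs occupy alignTo(10, 4) + 4 = 16 registers. Otherwise VGPRs and
// AGPRs come from separate files, so the total is the larger of the two.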
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  if (ST.hasGFX90AInsts() && ArgNumAGPR)
    return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
  return std::max(ArgNumVGPR, ArgNumAGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

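// Analyze every defined function in the module and cache its resource usage.
// If any function makes an indirect call, conservative register counts are
// propagated afterwards by propagateIndirectCallRegisterUsage().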
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(F);
    assert(MF && "function must have been generated already");

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

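// Compute the resource usage of a single machine function, folding in the
// already-computed usage of any known callees.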
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown-sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

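  // There are calls. Conservatively scan every register operand of every
  // instruction for the highest register index used directly, and fold in the
  // resource info of known callees at each call site.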
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
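        // Handle special registers first: some are tracked separately (e.g.
        // VCC), some never count toward the register budget (e.g. EXEC, M0),
        // and some must never appear here at all.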
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

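        // Classify the register: determine whether it is an SGPR, VGPR or
        // AGPR and how many 32-bit registers wide it is.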
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
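        // Record the highest hardware register index touched in each
        // register file.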
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}