//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can target any function except hardware
/// entry points. Therefore the register usage of functions with indirect calls
/// is estimated as the maximum over all non-entry-point functions in the
/// module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// If we don't know the true stack size, we have to report some amount to the
// runtime ahead of time. Assume a smaller number if the unknown size is only
// due to dynamic / non-entry-block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

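// Return the function a call pseudo's callee operand refers to, looking
// through a GlobalAlias to the aliased function. An immediate operand of 0
// denotes an unknown callee and yields nullptr.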
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getAliasee());
  return cast<Function>(Op.getGlobal());
}

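// Return true if Reg appears in any operand that is not an implicit operand of
// a FLAT instruction.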
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

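// The total SGPR count is the number of explicitly used SGPRs plus the extra
// SGPRs reserved for VCC, flat scratch and XNACK, depending on the subtarget.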
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

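// Analyze every machine function that has been generated for this module and
// record its resource usage. If any function contains an indirect call, a
// second pass propagates the worst-case register usage to those callers.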
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(F);
    assert(MF && "function must have been generated already");

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

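// Compute the register counts, private segment (stack) size and related flags
// for a single function. Functions without calls are handled by a fast path
// that simply queries MachineRegisterInfo; otherwise every instruction is
// scanned and callee usage is merged in at call sites.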
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly,
  // however, may still need it. If the only uses of flat_scr are implicit
  // operands on flat instructions, the register is not really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown-sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

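  // The function makes calls, so scan every operand of every instruction and
  // track the highest register index touched in each register bank. Register
  // and stack usage of callees is folded in when their call sites are visited.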
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

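        // Classify the register by bank (SGPR, VGPR or AGPR) and record its
        // width in units of 32-bit registers.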
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
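        // MaxUsed is the highest 32-bit register index covered by this
        // operand.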
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior from an illegal call to a
        // kernel: if a call site's calling convention doesn't match the
        // callee's, it's undefined behavior; if it did match, the call would
        // have been rejected earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

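        // A call is treated as indirect or external if the callee cannot be
        // resolved or is not defined in this module; such calls fall back to
        // the assumed external-call stack size and worst-case register usage.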
        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only helps when the kernel does not directly call
            // the tail-called function. If a kernel directly calls a
            // tail-recursive function, we'll assume maximum stack size based
            // on the regular call instruction.
            CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }


        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later in
          // propagateIndirectCallRegisterUsage().
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum register usage over all non-entry-point functions.
  // All of these functions are potential targets of indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-entry-point functions in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}