//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
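/// For example, if such a module also contains a non-entry function C that
/// needs 40 VGPRs, then any function that makes an indirect call is reported
/// as needing 40 VGPRs, because C is a possible call target.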
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

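// Resolve the callee of a call pseudo from its callee operand. An immediate
// operand (always 0) encodes an unknown callee; a GlobalAlias is looked
// through to the aliased function.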
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

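// Returns true if Reg has any use other than as an implicit operand of a FLAT
// instruction, i.e. a use that actually requires the register to be set up.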
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

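// The total SGPR count is the explicitly used SGPRs plus the extra SGPRs
// reserved for VCC, flat scratch and (when XNACK is enabled) the XNACK mask.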
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

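// The total VGPR count folds the AGPR count in according to whether the
// subtarget has gfx90a-style unified AGPR/VGPR allocation (see
// AMDGPU::getTotalNumVGPRs).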
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!AssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

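  // Walk the call graph in post order so that callees are analyzed before
  // their callers; a caller's resource info can then simply fold in the
  // already-computed callee info.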
  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

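  // A realigned stack may need up to MaxAlign extra bytes to align the frame
  // base, so account for that conservatively.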
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

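  // Otherwise scan every operand of every instruction and track the highest
  // register index touched per bank. -1 means no register of that kind is
  // used; the +1 conversion at the end then yields a count of zero.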
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
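        // Special and reserved registers are handled here: they do not count
        // toward the allocatable SGPR/VGPR budget, VCC and FLAT_SCR are
        // tracked separately, and registers that must never appear are
        // asserted on.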
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

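        // Classify the remaining physical registers by register class to get
        // the bank (SGPR, VGPR or AGPR) and the width in 32-bit registers.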
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

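        // A call is treated as indirect if the callee cannot be resolved or is
        // only a declaration in this module; its resource usage is then
        // unknown and handled later by propagateIndirectCallRegisterUsage().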
        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize =
                std::max(CalleeFrameSize,
                         static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

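  // Convert the highest used register indices into register counts and fold
  // the largest callee frame into this function's private segment size.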
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}