//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly and binary code. When
/// passed an MCAsmStreamer it prints assembly, and when passed an
/// MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

// If we don't know the true stack size, we have to report some amount to the
// runtime ahead of time. Assume a smaller number if this is only due to
// dynamic / non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
  "amdgpu-assume-external-call-stack-size",
  cl::desc("Assumed stack use of any external call (in bytes)"),
  cl::Hidden,
  cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
  "amdgpu-assume-dynamic-stack-object-size",
  cl::desc("Assumed extra stack use if there are any "
           "variable sized objects (in bytes)"),
  cl::Hidden,
  cl::init(4096));

// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
         FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
}

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
                                     llvm::createR600AsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    if (isHsaAbiVersion2(getGlobalSTI())) {
      HSAMetadataStream.reset(new HSAMD::MetadataStreamerV2());
    } else {
      HSAMetadataStream.reset(new HSAMD::MetadataStreamerV3());
    }
  }
}

StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}

const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
  return TM.getMCSubtargetInfo();
}

AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
  if (!OutStreamer)
    return nullptr;
  return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
}

void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  if (isHsaAbiVersion3(getGlobalSTI())) {
    std::string ExpectedTarget;
    raw_string_ostream ExpectedTargetOS(ExpectedTarget);
    IsaInfo::streamIsaVersion(getGlobalSTI(), ExpectedTargetOS);

    getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
  }

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    HSAMetadataStream->begin(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);

  if (isHsaAbiVersion3(getGlobalSTI()))
    return;

  // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);

  // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
  IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
  getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
      Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}

void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // The following code requires the TargetStreamer to be present.
  if (!getTargetStreamer())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
      isHsaAbiVersion2(getGlobalSTI())) {
    // Emit ISA Version (NT_AMD_AMDGPU_ISA).
    std::string ISAVersionString;
    raw_string_ostream ISAVersionStream(ISAVersionString);
    IsaInfo::streamIsaVersion(getGlobalSTI(), ISAVersionStream);
    getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
  }

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}

bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
  const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
  // XXX - Is there a smarter way to check this?
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}

void AMDGPUAsmPrinter::emitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();
  if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    amd_kernel_code_t KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}

void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
      isHsaAbiVersion2(getGlobalSTI()))
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  Streamer.PushSection();
  Streamer.SwitchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.emitValueToAlignment(64, 0, 1, 0);
  if (ReadOnlySection.getAlignment() < 64)
    ReadOnlySection.setAlignment(Align(64));

  const MCSubtargetInfo &STI = MF->getSubtarget();

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      STI, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
          IsaInfo::getNumExtraSGPRs(&STI,
                                    CurrentProgramInfo.VCCUsed,
                                    CurrentProgramInfo.FlatUsed),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
      hasXNACK(STI));

  Streamer.PopSection();
}

void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
      isHsaAbiVersion3(getGlobalSTI())) {
    AsmPrinter::emitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction());
    getTargetStreamer()->EmitAMDGPUSymbolType(
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }
  if (DumpCodeInstEmitter) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.push_back("");
  }

  AsmPrinter::emitFunctionEntryLabel();
}

void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
  if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber())
         + "_" + Twine(MBB.getNumber()) + ":").str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.push_back("");
  }
  AsmPrinter::emitBasicBlockStart(MBB);
}

void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
      OutContext.reportError({},
                             Twine(GV->getName()) +
                                 ": unsupported initializer for address space");
      return;
    }

    // LDS variables aren't emitted in HSA or PAL yet.
    const Triple::OSType OS = TM.getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return;

    MCSymbol *GVSym = getSymbol(GV);

    GVSym->redefineIfPossible();
    if (GVSym->isDefined() || GVSym->isVariable())
      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
                         "' is already defined");

    const DataLayout &DL = GV->getParent()->getDataLayout();
    uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
    Align Alignment = GV->getAlign().getValueOr(Align(4));

    emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
    emitLinkage(GV, GVSym);
    if (auto TS = getTargetStreamer())
      TS->emitAMDGPULDS(GVSym, Size, Alignment);
    return;
  }

  AsmPrinter::emitGlobalVariable(GV);
}

bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  CallGraphResourceInfo.clear();

  // Pad with s_code_end to help tools and guard against instruction prefetch
  // causing stale data in caches. Arguably this should be done by the linker,
  // which is why this isn't done for Mesa.
  const MCSubtargetInfo &STI = *getGlobalSTI();
  if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
      (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
       STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
    getTargetStreamer()->EmitCodeEnd(STI);
  }

  return AsmPrinter::doFinalization(M);
}

// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
  uint32_t NumVGPR,
  Optional<uint32_t> NumAGPR,
  uint32_t TotalNumVGPR,
  uint32_t NumSGPR,
  uint64_t ScratchSize,
  uint64_t CodeSize,
  const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
  if (NumAGPR) {
    OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
    OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
                                false);
  }
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}

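// Collect the kernel code property flags for the AMDHSA kernel descriptor
// from the function's machine info: one enable bit per preloaded user SGPR,
// plus the wavefront-size-32 flag.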
uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  uint16_t KernelCodeProperties = 0;

  if (MFI.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (MFI.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  if (MFI.hasQueuePtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (MFI.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (MFI.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (MFI.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
  }

  return KernelCodeProperties;
}

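// Build the amdhsa::kernel_descriptor_t from the computed program info:
// segment sizes, the RSRC1/RSRC2 (and, on gfx90a, RSRC3) words, and the
// kernel code properties gathered above.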
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
    const MachineFunction &MF,
    const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  amdhsa::kernel_descriptor_t KernelDescriptor;
  memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));

  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.getComputePGMRSrc1()));
  assert(isUInt<32>(PI.ComputePGMRSrc2));

  KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  // Use the PI parameter consistently here; the only caller passes
  // CurrentProgramInfo, so the value is unchanged.
  assert(STM.hasGFX90AInsts() || PI.ComputePGMRSrc3GFX90A == 0);
  if (STM.hasGFX90AInsts())
    KernelDescriptor.compute_pgm_rsrc3 = PI.ComputePGMRSrc3GFX90A;

  return KernelDescriptor;
}

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

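  // Entry points get full program info computed here; for other callable
  // functions we only record resource usage, which callers accumulate
  // (CodeGen visits functions in SCC order, callees before callers).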
  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  } else {
    auto I = CallGraphResourceInfo.insert(
      std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = I.first->second;
    assert(I.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF);
  }

  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer, even if it does
    // not really want to let us have it. This only works with -filetype=obj.
    bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
    OutStreamer->setUseAssemblerInfoForParsing(true);
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(" Function info:", false);
      SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
      emitCommonFunctionComments(
        Info.NumVGPR,
        STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
        Info.getTotalNumVGPRs(STM),
        Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
        Info.PrivateSegmentSize,
        getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR,
                               STM.hasMAIInsts()
                                 ? CurrentProgramInfo.NumAccVGPR
                                 : Optional<uint32_t>(),
                               CurrentProgramInfo.NumVGPR,
                               CurrentProgramInfo.NumSGPR,
                               CurrentProgramInfo.ScratchSize,
                               getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
      " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
      " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
      " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
      " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
      " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
    OutStreamer->emitRawComment(
      " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
      " NumSGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
    OutStreamer->emitRawComment(
      " NumVGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

    if (STM.hasGFX90AInsts())
      OutStreamer->emitRawComment(
        " AccumOffset: " +
        Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);

    OutStreamer->emitRawComment(
      " Occupancy: " +
      Twine(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
      " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
      Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:USER_SGPR: " +
      Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
      Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
      Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
      Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
      Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
      Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
      false);

    assert(STM.hasGFX90AInsts() ||
           CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
                               false);
      OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
                               false);
    }
  }

  if (DumpCodeInstEmitter) {
    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(StringRef(Comment));
    }
  }

  return false;
}

uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugInstr())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  return CodeSize;
}

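// Return true if Reg has any use that is not an implicit operand of a FLAT
// instruction; only such uses (e.g. from inline asm) mean flat_scr is
// genuinely needed.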
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII,
                                  unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
  const GCNSubtarget &ST) const {
  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
                                                     UsesVCC, UsesFlatScratch);
}

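// On gfx90a, AGPRs are allocated together with VGPRs: the VGPR count is
// rounded up to a multiple of 4 before the AGPRs are appended. On other
// targets the effective total is the larger of the two counts.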
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
  const GCNSubtarget &ST) const {
  if (ST.hasGFX90AInsts() && NumAGPR)
    return alignTo(NumVGPR, 4) + NumAGPR;
  return std::max(NumVGPR, NumAGPR);
}

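// A call pseudo's callee operand is either a global (the called function) or
// an immediate 0 placeholder; return null for the latter so callers treat it
// as an unknown/external call.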
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  return cast<Function>(Op.getGlobal());
}

AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
  const MachineFunction &MF) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
        TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
      TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
      TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

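  // With calls we cannot trust MachineRegisterInfo alone: scan every
  // instruction, tracking the highest register index touched in each file,
  // and accumulate the worst-case callee frame size.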
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() && "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
            "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
            "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
            "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp
          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();
        bool IsExternal = !Callee || Callee->isDeclaration();
        if (!IsExternal)
          I = CallGraphResourceInfo.find(Callee);

        if (IsExternal || I == CallGraphResourceInfo.end()) {
          // Avoid crashing on undefined behavior with an illegal call to a
          // kernel. If a callsite's calling convention doesn't match the
          // function's, it's undefined behavior. If the callsite calling
          // convention does match, that would have errored earlier.
          // FIXME: The verifier shouldn't allow this.
          if (!IsExternal &&
              AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
            report_fatal_error("invalid call to entry function");

          // If this is a call to an external function, we can't do much. Make
          // conservative guesses.

          // 48 SGPRs - vcc - flat_scr - xnack
          int MaxSGPRGuess =
            47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
          MaxVGPR = std::max(MaxVGPR, 23);
          MaxAGPR = std::max(MaxAGPR, 23);

          CalleeFrameSize = std::max(CalleeFrameSize,
            static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.

          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize
            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
        }

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();

  ProgInfo.NumArchVGPR = Info.NumVGPR;
  ProgInfo.NumAccVGPR = Info.NumAGPR;
  ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
  ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

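  // The hardware limit is on scratch per wave; diagnose if the per-workitem
  // scratch size exceeds what MaxWaveScratchSize allows for this wavefront
  // size.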
  const uint64_t MaxScratchPerWorkitem =
      GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
  if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

  if (isShader(F.getCallingConv())) {
    // FIXME: We should be using the number of registers determined during
    // calling convention lowering to legalize the types.
    const DataLayout &DL = F.getParent()->getDataLayout();
    for (auto &Arg : F.args()) {
      unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
      if (Arg.hasAttribute(Attribute::InReg))
        WaveDispatchNumSGPR += NumRegs;
      else
        WaveDispatchNumVGPR += NumRegs;
    }
    ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
    ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
  }

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      &STM, ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      &STM, ProgInfo.NumVGPRsForWavesPerEU);

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make the clamp modifier return 0 on NaN inputs.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize =
    MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);

  if (STM.hasGFX90AInsts()) {
    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                    ProgInfo.AccumOffset);
    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                    ProgInfo.TgSplit);
  }

  ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
                                            ProgInfo.NumSGPRsForWavesPerEU,
                                            ProgInfo.NumVGPRsForWavesPerEU);
}

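// Map a calling convention to the hardware register that receives its RSRC1
// value: compute uses COMPUTE_PGM_RSRC1, and each graphics shader stage uses
// its corresponding SPI_SHADER_PGM_RSRC1_* register.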
static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: LLVM_FALLTHROUGH;
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
    OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->emitInt32(RsrcReg);
    OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
    OutStreamer->emitIntValue(
        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    OutStreamer->emitInt32(
        S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto MD = getTargetStreamer()->getPALMetadata();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
  MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
  if (AMDGPU::isCompute(CC)) {
    MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
  } else {
    if (CurrentProgramInfo.ScratchBlocks > 0)
      MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
  }
  // ScratchSize is in bytes, aligned to 16.
  MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
    MD->setSpiPsInputEna(MFI->getPSInputEnable());
    MD->setSpiPsInputAddr(MFI->getPSInputAddr());
  }

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}

void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MD->setFunctionScratchSize(MF, MFI.getStackSize());
  // Set compute registers.
  MD->setRsrc1(CallingConv::AMDGPU_CS,
               CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
}

// This is supposed to be log2(Size).
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();

  AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);

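  // Pack RSRC1 into the low 32 bits and RSRC2 into the high 32 bits of the
  // combined compute_pgm_resource_registers field.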
  Out.compute_pgm_resource_registers =
      CurrentProgramInfo.getComputePGMRSrc1() |
      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
  Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;

  if (CurrentProgramInfo.DynamicCallStack)
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;

  AMD_HSA_BITS_SET(Out.code_properties,
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

  if (MFI->hasPrivateSegmentBuffer()) {
    Out.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (MFI->hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (MFI->hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (MFI->hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (MFI->hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (MFI->hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  Align MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}

bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  } else if (MO.isImm()) {
    int64_t Val = MO.getImm();
    if (AMDGPU::isInlinableIntLiteral(Val)) {
      O << Val;
    } else if (isUInt<16>(Val)) {
      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
    } else if (isUInt<32>(Val)) {
      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
    } else {
      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
    }
    return false;
  }
  return true;
}