1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer  -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 ///
11 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
13 /// an MCObjectStreamer it outputs binary code.
14 //
15 //===----------------------------------------------------------------------===//
16 //
17 
18 #include "AMDGPUAsmPrinter.h"
19 #include "AMDGPU.h"
20 #include "AMDGPUSubtarget.h"
21 #include "AMDGPUTargetMachine.h"
22 #include "InstPrinter/AMDGPUInstPrinter.h"
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
25 #include "R600AsmPrinter.h"
26 #include "R600Defines.h"
27 #include "R600MachineFunctionInfo.h"
28 #include "R600RegisterInfo.h"
29 #include "SIDefines.h"
30 #include "SIInstrInfo.h"
31 #include "SIMachineFunctionInfo.h"
32 #include "SIRegisterInfo.h"
33 #include "Utils/AMDGPUBaseInfo.h"
34 #include "llvm/BinaryFormat/ELF.h"
35 #include "llvm/CodeGen/MachineFrameInfo.h"
36 #include "llvm/IR/DiagnosticInfo.h"
37 #include "llvm/MC/MCContext.h"
38 #include "llvm/MC/MCSectionELF.h"
39 #include "llvm/MC/MCStreamer.h"
40 #include "llvm/Support/AMDGPUMetadata.h"
41 #include "llvm/Support/MathExtras.h"
42 #include "llvm/Support/TargetParser.h"
43 #include "llvm/Support/TargetRegistry.h"
44 #include "llvm/Target/TargetLoweringObjectFile.h"
45 
46 using namespace llvm;
47 using namespace llvm::AMDGPU;
48 using namespace llvm::AMDGPU::HSAMD;
49 
50 // TODO: This should get the default rounding mode from the kernel. We just set
51 // the default here, but this could change if the OpenCL rounding mode pragmas
52 // are used.
53 //
54 // The denormal mode here should match what is reported by the OpenCL runtime
55 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
57 //
58 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
59 // precision, and leaves single precision to flush all and does not report
60 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
61 // CL_FP_DENORM for both.
62 //
63 // FIXME: It seems some instructions do not support single precision denormals
64 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
65 // and sin_f32, cos_f32 on most parts).
66 
67 // We want to use these instructions, and using fp32 denormals also causes
68 // instructions to run at the double precision rate for the device so it's
69 // probably best to just report no single precision denormals.
70 static uint32_t getFPMode(const MachineFunction &F) {
71   const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
72   // TODO: Is there any real use for the flush in only / flush out only modes?
73 
74   uint32_t FP32Denormals =
75     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
76 
77   uint32_t FP64Denormals =
78     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
79 
80   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
81          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
82          FP_DENORM_MODE_SP(FP32Denormals) |
83          FP_DENORM_MODE_DP(FP64Denormals);
84 }
85 
86 static AsmPrinter *
87 createAMDGPUAsmPrinterPass(TargetMachine &tm,
88                            std::unique_ptr<MCStreamer> &&Streamer) {
89   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
90 }
91 
// Registry hook called at target initialization: the r600 target uses the
// R600 asm printer, the GCN target uses AMDGPUAsmPrinter (via the factory
// above).
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
                                     llvm::createR600AsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}
98 
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {
    // Select the HSA metadata encoding matching the code object ABI version
    // of the module-level subtarget.
    if (IsaInfo::hasCodeObjectV3(getSTI()))
      HSAMetadataStream.reset(new MetadataStreamerV3());
    else
      HSAMetadataStream.reset(new MetadataStreamerV2());
}
107 
// Human-readable pass name shown in -debug-pass output and diagnostics.
StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}
111 
// Return the target machine's MC subtarget info (module-level, not the
// per-function subtarget).
const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
  return TM.getMCSubtargetInfo();
}
115 
116 AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
117   if (!OutStreamer)
118     return nullptr;
119   return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
120 }
121 
122 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
123   if (IsaInfo::hasCodeObjectV3(getSTI())) {
124     std::string ExpectedTarget;
125     raw_string_ostream ExpectedTargetOS(ExpectedTarget);
126     IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
127 
128     getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
129   }
130 
131   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
132       TM.getTargetTriple().getOS() != Triple::AMDPAL)
133     return;
134 
135   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
136     HSAMetadataStream->begin(M);
137 
138   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
139     readPALMetadata(M);
140 
141   if (IsaInfo::hasCodeObjectV3(getSTI()))
142     return;
143 
144   // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
145   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
146     getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
147 
148   // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
149   IsaVersion Version = getIsaVersion(getSTI()->getCPU());
150   getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
151       Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
152 }
153 
154 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
155   // Following code requires TargetStreamer to be present.
156   if (!getTargetStreamer())
157     return;
158 
159   if (!IsaInfo::hasCodeObjectV3(getSTI())) {
160     // Emit ISA Version (NT_AMD_AMDGPU_ISA).
161     std::string ISAVersionString;
162     raw_string_ostream ISAVersionStream(ISAVersionString);
163     IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
164     getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
165   }
166 
167   // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
168   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
169     HSAMetadataStream->end();
170     bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
171     (void)Success;
172     assert(Success && "Malformed HSA Metadata");
173   }
174 
175   if (!IsaInfo::hasCodeObjectV3(getSTI())) {
176     // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
177     if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
178       // Copy the PAL metadata from the map where we collected it into a vector,
179       // then write it as a .note.
180       PALMD::Metadata PALMetadataVector;
181       for (auto i : PALMetadataMap) {
182         PALMetadataVector.push_back(i.first);
183         PALMetadataVector.push_back(i.second);
184       }
185       getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
186     }
187   }
188 }
189 
// Returns true only if the base class agrees the block is fallthrough-only
// AND the block does not end in a long-branch sequence, which needs its own
// label as a branch-relative anchor.
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
  const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
  // XXX - Is there a smarter way to check this?
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}
203 
204 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
205   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
206   if (!MFI.isEntryFunction())
207     return;
208 
209   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
210   const Function &F = MF->getFunction();
211   if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F) &&
212       (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
213        F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
214     amd_kernel_code_t KernelCode;
215     getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
216     getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
217   }
218 
219   if (STM.isAmdHsaOS())
220     HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
221 }
222 
// For code object v3 entry points on HSA, emit the kernel descriptor into the
// read-only section once the body has been emitted.
void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;
  if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
      TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  // Temporarily switch to .rodata; the current section is restored by the
  // matching PopSection below.
  Streamer.PushSection();
  Streamer.SwitchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.EmitValueToAlignment(64, 0, 1, 0);
  if (ReadOnlySection.getAlignment() < 64)
    ReadOnlySection.setAlignment(64);

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  // The SGPR count passed to the streamer excludes the extra registers
  // reserved for VCC/flat scratch; the streamer accounts for them itself.
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
          IsaInfo::getNumExtraSGPRs(getSTI(),
                                    CurrentProgramInfo.VCCUsed,
                                    CurrentProgramInfo.FlatUsed),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
      hasXNACK(*getSTI()));

  Streamer.PopSection();
}
259 
260 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
261   if (IsaInfo::hasCodeObjectV3(getSTI()) &&
262       TM.getTargetTriple().getOS() == Triple::AMDHSA) {
263     AsmPrinter::EmitFunctionEntryLabel();
264     return;
265   }
266 
267   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
268   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
269   if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
270     SmallString<128> SymbolName;
271     getNameWithPrefix(SymbolName, &MF->getFunction()),
272     getTargetStreamer()->EmitAMDGPUSymbolType(
273         SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
274   }
275   const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
276   if (STI.dumpCode()) {
277     // Disassemble function name label to text.
278     DisasmLines.push_back(MF->getName().str() + ":");
279     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
280     HexLines.push_back("");
281   }
282 
283   AsmPrinter::EmitFunctionEntryLabel();
284 }
285 
286 void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
287   const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
288   if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
289     // Write a line for the basic block label if it is not only fallthrough.
290     DisasmLines.push_back(
291         (Twine("BB") + Twine(getFunctionNumber())
292          + "_" + Twine(MBB.getNumber()) + ":").str());
293     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
294     HexLines.push_back("");
295   }
296   AsmPrinter::EmitBasicBlockStart(MBB);
297 }
298 
299 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
300 
301   // Group segment variables aren't emitted in HSA.
302   if (AMDGPU::isGroupSegment(GV))
303     return;
304 
305   AsmPrinter::EmitGlobalVariable(GV);
306 }
307 
// Drop per-module resource-usage state before the base class finalizes.
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  CallGraphResourceInfo.clear();
  return AsmPrinter::doFinalization(M);
}
312 
313 // For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
314 // frontend into our PALMetadataMap, ready for per-function modification.  It
315 // is a NamedMD containing an MDTuple containing a number of MDNodes each of
316 // which is an integer value, and each two integer values forms a key=value
317 // pair that we store as PALMetadataMap[key]=value in the map.
318 void AMDGPUAsmPrinter::readPALMetadata(Module &M) {
319   auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
320   if (!NamedMD || !NamedMD->getNumOperands())
321     return;
322   auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
323   if (!Tuple)
324     return;
325   for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
326     auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
327     auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
328     if (!Key || !Val)
329       continue;
330     PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue();
331   }
332 }
333 
334 // Print comments that apply to both callable functions and entry points.
// Print comments that apply to both callable functions and entry points.
// Each line goes out as a raw assembly comment describing the resource usage
// the caller computed (register counts, scratch, code size).
void AMDGPUAsmPrinter::emitCommonFunctionComments(
  uint32_t NumVGPR,
  uint32_t NumSGPR,
  uint64_t ScratchSize,
  uint64_t CodeSize,
  const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}
348 
349 uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
350     const MachineFunction &MF) const {
351   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
352   uint16_t KernelCodeProperties = 0;
353 
354   if (MFI.hasPrivateSegmentBuffer()) {
355     KernelCodeProperties |=
356         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
357   }
358   if (MFI.hasDispatchPtr()) {
359     KernelCodeProperties |=
360         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
361   }
362   if (MFI.hasQueuePtr()) {
363     KernelCodeProperties |=
364         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
365   }
366   if (MFI.hasKernargSegmentPtr()) {
367     KernelCodeProperties |=
368         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
369   }
370   if (MFI.hasDispatchID()) {
371     KernelCodeProperties |=
372         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
373   }
374   if (MFI.hasFlatScratchInit()) {
375     KernelCodeProperties |=
376         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
377   }
378 
379   return KernelCodeProperties;
380 }
381 
// Build the code-object-v3 kernel descriptor from the program info computed
// for this kernel. Returned by value; the caller streams it out.
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
    const MachineFunction &MF,
    const SIProgramInfo &PI) const {
  amdhsa::kernel_descriptor_t KernelDescriptor;
  // memset (rather than member-wise init) so reserved/padding bytes are
  // zeroed as well.
  memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));

  // The descriptor fields are 32-bit; the computed values must fit.
  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.ComputePGMRSrc1));
  assert(isUInt<32>(PI.ComputePGMRSrc2));

  KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
  KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  return KernelDescriptor;
}
400 
// Main per-function driver: computes resource usage, emits OS-specific
// program info, the function body, verbose resource comments, and (with
// -dump-code) the collected disassembly text.
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

  // Entry points get full program info; callable functions only record their
  // resource usage so callers can aggregate it later (see
  // analyzeResourceUsage).
  if (MFI->isEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  } else {
    auto I = CallGraphResourceInfo.insert(
      std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = I.first->second;
    assert(I.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF);
  }

  if (STM.isAmdPalOS())
    EmitPALMetadata(MF, CurrentProgramInfo);
  else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  // Reset the -dump-code buffers; they are filled while emitting the body.
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  // In verbose mode, emit the resource usage as comments in .AMDGPU.csdata.
  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(" Function info:", false);
      SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
      emitCommonFunctionComments(
        Info.NumVGPR,
        Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
        Info.PrivateSegmentSize,
        getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
                               CurrentProgramInfo.NumSGPR,
                               CurrentProgramInfo.ScratchSize,
                               getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
      " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
      " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
      " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
      " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
      " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
    OutStreamer->emitRawComment(
      " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
      " NumSGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
    OutStreamer->emitRawComment(
      " NumVGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

    OutStreamer->emitRawComment(
      " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
      OutStreamer->emitRawComment(
        " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
        Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
      OutStreamer->emitRawComment(
        " DebuggerPrivateSegmentBufferSGPR: s" +
        Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
    }

    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:USER_SGPR: " +
      Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
      Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
      Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
      Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
      Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
      Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
      false);
  }

  // With -dump-code, write each disassembly line padded to a column, followed
  // by its hex encoding (when present), into a .AMDGPU.disasm note section.
  if (STM.dumpCode()) {

    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}
537 
538 uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
539   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
540   const SIInstrInfo *TII = STM.getInstrInfo();
541 
542   uint64_t CodeSize = 0;
543 
544   for (const MachineBasicBlock &MBB : MF) {
545     for (const MachineInstr &MI : MBB) {
546       // TODO: CodeSize should account for multiple functions.
547 
548       // TODO: Should we count size of debug info?
549       if (MI.isDebugInstr())
550         continue;
551 
552       CodeSize += TII->getInstSizeInBytes(MI);
553     }
554   }
555 
556   return CodeSize;
557 }
558 
559 static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
560                                   const SIInstrInfo &TII,
561                                   unsigned Reg) {
562   for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
563     if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
564       return true;
565   }
566 
567   return false;
568 }
569 
// Total SGPR count: the explicitly used SGPRs plus the extra SGPRs the
// target adds based on VCC/flat-scratch usage.
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
  const GCNSubtarget &ST) const {
  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
                                                     UsesVCC, UsesFlatScratch);
}
575 
// Compute the register, stack and special-register usage of one function.
// For call-free functions this reads MachineRegisterInfo directly; otherwise
// it scans every operand to find the highest SGPR/VGPR touched and folds in
// callee info (already computed — CodeGen runs in SCC order).
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
  const MachineFunction &MF) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();


  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get the
    // number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
      TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
      TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  // Slow path: walk every operand of every instruction, tracking the highest
  // hardware register index seen for each register file.
  int32_t MaxVGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;

        if (!MO.isReg())
          continue;

        // Special registers are handled individually; they either don't count
        // against the SGPR/VGPR budget or set a dedicated usage flag.
        unsigned Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr());
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        default:
          break;
        }

        // Classify the register by its class to get its file (SGPR/VGPR) and
        // its width in 32-bit units.
        if (AMDGPU::SReg_32RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
            "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
            "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
            "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp
          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
        const Function *Callee = cast<Function>(CalleeOp->getGlobal());
        if (Callee->isDeclaration()) {
          // If this is a call to an external function, we can't do much. Make
          // conservative guesses.

          // 48 SGPRs - vcc, - flat_scr, -xnack
          int MaxSGPRGuess =
              47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
                                             ST.hasFlatAddressSpace());
          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
          MaxVGPR = std::max(MaxVGPR, 23);

          CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          auto I = CallGraphResourceInfo.find(Callee);
          assert(I != CallGraphResourceInfo.end() &&
                 "callee should have been handled before caller");

          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          CalleeFrameSize
            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
        }

        if (!Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  // Convert the highest index seen (0-based) into register counts.
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
807 
// Compute the SIProgramInfo for \p MF: register counts (including extra
// SGPRs implied by vcc/flat_scratch usage and wave-dispatch function
// arguments), scratch and LDS sizes, and the packed COMPUTE_PGM_RSRC1/RSRC2
// register values. Emits error diagnostics when the function exceeds
// addressable hardware resource limits.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);

  ProgInfo.NumVGPR = Info.NumVGPR;
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  // Diagnose per-thread scratch sizes that do not fit in 32 bits.
  if (!isUInt<32>(ProgInfo.ScratchSize)) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = STM.getInstrInfo();
  const SIRegisterInfo *RI = &TII->getRegisterInfo();

  // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      // Clamp so the count below (after ExtraSGPRs is added) stays in range.
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Fold in the extra SGPRs (vcc, flat_scratch, xnack) implied by the
  // features this function uses.
  ProgInfo.NumSGPR += ExtraSGPRs;

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
  for (auto &Arg : MF.getFunction().args()) {
    // Round each argument up to a whole number of 32-bit registers.
    unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
    // InReg arguments are passed in SGPRs, all others in VGPRs.
    if (Arg.hasAttribute(Attribute::InReg))
      WaveDispatchNumSGPR += NumRegs;
    else
      WaveDispatchNumVGPR += NumRegs;
  }
  ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
  ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  // Older generations (and SGPR-init-bug parts) are checked here, after
  // ExtraSGPRs has been added, unlike the pre-add check above.
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  // Parts with the SGPR init bug must always be programmed with the fixed
  // SGPR count, regardless of actual usage.
  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // Block counts are the granule-rounded values programmed into rsrc1.
  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      &STM, ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      &STM, ProgInfo.NumVGPRsForWavesPerEU);

  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
  // attribute was requested.
  if (STM.debuggerEmitPrologue()) {
    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
      RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
      RI->getHWRegIndex(MFI->getScratchRSrcReg());
  }

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  ProgInfo.IEEEMode = STM.enableIEEEBit(MF);

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = STM.enableDX10Clamp();

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  // LDS spill space is per-workitem, so scale by the maximum flat workgroup
  // size to get the workgroup total.
  unsigned LDSSpillSize =
    MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  // Pack the rsrc1 fields into the register image.
  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}
997 
998 static unsigned getRsrcReg(CallingConv::ID CallConv) {
999   switch (CallConv) {
1000   default: LLVM_FALLTHROUGH;
1001   case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
1002   case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
1003   case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
1004   case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
1005   case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
1006   case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
1007   case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
1008   }
1009 }
1010 
// Emit the program resource configuration as a sequence of
// (register address, value) 32-bit word pairs for \p MF, using the
// previously computed \p CurrentProgramInfo. This is the non-PAL path; see
// EmitPALMetadata below for the AMDPAL equivalent.
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    // Compute shaders get the full rsrc1/rsrc2 register images plus the
    // temporary-ring (scratch) size.
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);

    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    // Graphics shaders only program the VGPR/SGPR block counts and the SPI
    // scratch ring size into the stage-specific rsrc1 register.
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(
        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
  }

  // Pixel shaders additionally report extra LDS size and the PS input
  // enable/address masks.
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
  }

  // Spill statistics are reported for every calling convention.
  OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
  OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
}
1052 
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL.  It stores each compute/SPI register setting and other PAL
// metadata items into the PALMetadataMap, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is
// then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Given the calling convention, calculate the register number for rsrc1. In
  // principle the register number could change in future hardware, but we know
  // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
  // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
  // that we use a register number rather than a byte offset, so we need to
  // divide by 4.
  unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4;
  unsigned Rsrc2Reg = Rsrc1Reg + 1;
  // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
  // with a constant offset to access any non-register shader-specific PAL
  // metadata key.
  unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE;
  switch (MF.getFunction().getCallingConv()) {
    case CallingConv::AMDGPU_PS:
      ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE;
      break;
    case CallingConv::AMDGPU_VS:
      ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE;
      break;
    case CallingConv::AMDGPU_GS:
      ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE;
      break;
    case CallingConv::AMDGPU_ES:
      ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE;
      break;
    case CallingConv::AMDGPU_HS:
      ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE;
      break;
    case CallingConv::AMDGPU_LS:
      ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE;
      break;
  }
  // Derive the per-stage NUM_USED_*GPRS keys from ScratchSizeKey using the
  // fixed offsets the VS keys exhibit; this relies on the per-stage keys
  // being laid out with identical spacing for every stage.
  unsigned NumUsedVgprsKey = ScratchSizeKey +
      PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE;
  unsigned NumUsedSgprsKey = ScratchSizeKey +
      PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE;
  PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
  PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
  // Register values are OR'ed into the map so frontend-provided metadata for
  // the same register is preserved.
  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
    PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
    // ScratchSize is in bytes, 16 aligned.
    PALMetadataMap[ScratchSizeKey] |=
        alignTo(CurrentProgramInfo.ScratchSize, 16);
  } else {
    PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
        S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
    if (CurrentProgramInfo.ScratchBlocks > 0)
      PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
    // ScratchSize is in bytes, 16 aligned.
    PALMetadataMap[ScratchSizeKey] |=
        alignTo(CurrentProgramInfo.ScratchSize, 16);
  }
  // Pixel shaders additionally record extra LDS size and the PS input
  // enable/address masks.
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    PALMetadataMap[Rsrc2Reg] |=
        S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
    PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
    PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
  }
}
1121 
1122 // This is supposed to be log2(Size)
1123 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1124   switch (Size) {
1125   case 4:
1126     return AMD_ELEMENT_4_BYTES;
1127   case 8:
1128     return AMD_ELEMENT_8_BYTES;
1129   case 16:
1130     return AMD_ELEMENT_16_BYTES;
1131   default:
1132     llvm_unreachable("invalid private_element_size");
1133   }
1134 }
1135 
1136 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1137                                         const SIProgramInfo &CurrentProgramInfo,
1138                                         const MachineFunction &MF) const {
1139   const Function &F = MF.getFunction();
1140   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1141          F.getCallingConv() == CallingConv::SPIR_KERNEL);
1142 
1143   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1144   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1145 
1146   AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());
1147 
1148   Out.compute_pgm_resource_registers =
1149       CurrentProgramInfo.ComputePGMRSrc1 |
1150       (CurrentProgramInfo.ComputePGMRSrc2 << 32);
1151   Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
1152 
1153   if (CurrentProgramInfo.DynamicCallStack)
1154     Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
1155 
1156   AMD_HSA_BITS_SET(Out.code_properties,
1157                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1158                    getElementByteSizeValue(STM.getMaxPrivateElementSize()));
1159 
1160   if (MFI->hasPrivateSegmentBuffer()) {
1161     Out.code_properties |=
1162       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1163   }
1164 
1165   if (MFI->hasDispatchPtr())
1166     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1167 
1168   if (MFI->hasQueuePtr())
1169     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1170 
1171   if (MFI->hasKernargSegmentPtr())
1172     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1173 
1174   if (MFI->hasDispatchID())
1175     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1176 
1177   if (MFI->hasFlatScratchInit())
1178     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1179 
1180   if (MFI->hasDispatchPtr())
1181     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1182 
1183   if (STM.debuggerSupported())
1184     Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
1185 
1186   if (STM.isXNACKEnabled())
1187     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1188 
1189   unsigned MaxKernArgAlign;
1190   Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1191   Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1192   Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1193   Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1194   Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1195 
1196   // These alignment values are specified in powers of two, so alignment =
1197   // 2^n.  The minimum alignment is 2^4 = 16.
1198   Out.kernarg_segment_alignment = std::max((size_t)4,
1199       countTrailingZeros(MaxKernArgAlign));
1200 
1201   if (STM.debuggerEmitPrologue()) {
1202     Out.debug_wavefront_private_segment_offset_sgpr =
1203       CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
1204     Out.debug_private_segment_buffer_sgpr =
1205       CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
1206   }
1207 }
1208 
1209 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1210                                        unsigned AsmVariant,
1211                                        const char *ExtraCode, raw_ostream &O) {
1212   // First try the generic code, which knows about modifiers like 'c' and 'n'.
1213   if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
1214     return false;
1215 
1216   if (ExtraCode && ExtraCode[0]) {
1217     if (ExtraCode[1] != 0)
1218       return true; // Unknown modifier.
1219 
1220     switch (ExtraCode[0]) {
1221     case 'r':
1222       break;
1223     default:
1224       return true;
1225     }
1226   }
1227 
1228   // TODO: Should be able to support other operand types like globals.
1229   const MachineOperand &MO = MI->getOperand(OpNo);
1230   if (MO.isReg()) {
1231     AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
1232                                        *MF->getSubtarget().getRegisterInfo());
1233     return false;
1234   }
1235 
1236   return true;
1237 }
1238