//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
/// code.  When passed an MCAsmStreamer it prints assembly, and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDKernelCodeT.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"

using namespace llvm;

// TODO: This should get the default rounding mode from the kernel. We just set
// the default here, but this could change if the OpenCL rounding mode pragmas
// are used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).
//
// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device, so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
  const AMDGPUSubtarget &ST = F.getSubtarget<AMDGPUSubtarget>();
  // TODO: Is there any real use for the flush in only / flush out only modes?

  uint32_t FP32Denormals =
    ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  uint32_t FP64Denormals =
    ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(FP32Denormals) |
         FP_DENORM_MODE_DP(FP64Denormals);
}

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {}

void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  // Need to construct an MCSubtargetInfo here in case we have no functions
  // in the module.
  std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
        TM.getTargetTriple().str(), TM.getTargetCPU(),
        TM.getTargetFeatureString()));

  AMDGPUTargetStreamer *TS =
      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());

  TS->EmitDirectiveHSACodeObjectVersion(1, 0);
  AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
  TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
                                    "AMD", "AMDGPU");
}

void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
  SIProgramInfo KernelInfo;
  if (STM.isAmdHsaOS()) {
    getSIProgramInfo(KernelInfo, *MF);
    EmitAmdKernelCodeT(*MF, KernelInfo);
  }
}

void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
  if (MFI->isKernel() && STM.isAmdHsaOS()) {
    AMDGPUTargetStreamer *TS =
        static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
    TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
                             ELF::STT_AMDGPU_HSA_KERNEL);
  }

  AsmPrinter::EmitFunctionEntryLabel();
}

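// HSA distinguishes module-scope globals (visible only within this code
// object) from program-scope globals (visible across code objects). Map the
// LLVM linkage kinds onto that split for the target streamer directives
// emitted below.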
static bool isModuleLinkage(const GlobalValue *GV) {
  switch (GV->getLinkage()) {
  case GlobalValue::LinkOnceODRLinkage:
  case GlobalValue::LinkOnceAnyLinkage:
  case GlobalValue::InternalLinkage:
  case GlobalValue::CommonLinkage:
    return true;
  case GlobalValue::ExternalLinkage:
    return false;
  default: llvm_unreachable("unknown linkage type");
  }
}

void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
  if (TM.getTargetTriple().getOS() != Triple::AMDHSA) {
    AsmPrinter::EmitGlobalVariable(GV);
    return;
  }

  if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) {
    AsmPrinter::EmitGlobalVariable(GV);
    return;
  }

  // Group segment variables aren't emitted in HSA.
  if (AMDGPU::isGroupSegment(GV))
    return;

  AMDGPUTargetStreamer *TS =
      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
  if (isModuleLinkage(GV)) {
    TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName());
  } else {
    TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName());
  }

  MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV));
  const DataLayout &DL = getDataLayout();

  // Emit the size.
  uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
  OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
  OutStreamer->PushSection();
  OutStreamer->SwitchSection(
      getObjFileLowering().SectionForGlobal(GV, *Mang, TM));
  const Constant *C = GV->getInitializer();
  OutStreamer->EmitLabel(GVSym);
  EmitGlobalConstant(DL, C);
  OutStreamer->PopSection();
}

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // The starting address of all shader programs must be aligned to 256 bytes.
  // MF.setAlignment() takes the log2 of the alignment, so 8 means 1 << 8.
  MF.setAlignment(8);

  SetupMachineFunction(MF);

  MCContext &Context = getObjFileLowering().getContext();
  MCSectionELF *ConfigSection =
      Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
  OutStreamer->SwitchSection(ConfigSection);

  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  SIProgramInfo KernelInfo;
  if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    getSIProgramInfo(KernelInfo, MF);
    if (!STM.isAmdHsaOS()) {
      EmitProgramInfoSI(MF, KernelInfo);
    }
  } else {
    EmitProgramInfoR600(MF);
  }

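  // DisasmLines and HexLines are filled in while EmitFunctionBody() lowers
  // each instruction when dumpCode() is enabled; reset them so this function
  // starts with empty buffers.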
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      OutStreamer->emitRawComment(" Kernel info:", false);
      OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
                                  false);
      OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
                                  false);
      OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
                                  false);
      OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
                                  false);
      OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
                                  false);
      OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
                                  false);
      OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
                                  " bytes/workgroup (compile time only)", false);

      OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
                                  false);
      OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
                                  false);

      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                  Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                  Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                  Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                  Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
                                  false);
      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                  Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
                                  false);
    } else {
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      OutStreamer->emitRawComment(
        Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
    }
  }

  if (STM.dumpCode()) {
    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
      Comment += " ; " + HexLines[i] + "\n";

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}

void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
  unsigned MaxGPR = 0;
  bool killPixel = false;
  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  const R600RegisterInfo *RI =
      static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (MI.getOpcode() == AMDGPU::KILLGT)
        killPixel = true;
      unsigned numOperands = MI.getNumOperands();
      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
        const MachineOperand &MO = MI.getOperand(op_idx);
        if (!MO.isReg())
          continue;
        unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;

        // Registers with an encoding value > 127 aren't GPRs.
        if (HWReg > 127)
          continue;
        MaxGPR = std::max(MaxGPR, HWReg);
      }
    }
  }

  unsigned RsrcReg;
  if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
    // Evergreen / Northern Islands
    switch (MF.getFunction()->getCallingConv()) {
    default: // Fall through
    case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
    case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
    case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
    case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
    }
  } else {
    // R600 / R700
    switch (MF.getFunction()->getCallingConv()) {
    default: // Fall through
    case CallingConv::AMDGPU_GS: // Fall through
    case CallingConv::AMDGPU_CS: // Fall through
    case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
    case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
    }
  }

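  // The .AMDGPU.config section consists of (register address, value) dword
  // pairs; the userspace driver reads these and programs the corresponding
  // hardware registers before launching the shader.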
  OutStreamer->EmitIntValue(RsrcReg, 4);
  OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
                            S_STACK_SIZE(MFI->StackSize), 4);
  OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
  OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);

  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
    OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4);
  }
}

void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) const {
  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  uint64_t CodeSize = 0;
  unsigned MaxSGPR = 0;
  unsigned MaxVGPR = 0;
  bool VCCUsed = false;
  bool FlatUsed = false;
  const SIRegisterInfo *RI =
      static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugValue())
        continue;

      // FIXME: This is reporting 0 for many instructions.
      CodeSize += MI.getDesc().Size;

      unsigned numOperands = MI.getNumOperands();
      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
        const MachineOperand &MO = MI.getOperand(op_idx);
        unsigned width = 0;
        bool isSGPR = false;

        if (!MO.isReg())
          continue;

        unsigned reg = MO.getReg();
        switch (reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
          VCCUsed = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          FlatUsed = true;
          continue;

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("Trap Handler registers should not be used");

        default:
          break;
        }

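        // Classify the operand by register class to determine whether it
        // occupies SGPRs or VGPRs and how many consecutive registers the
        // tuple spans.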
        if (AMDGPU::SReg_32RegClass.contains(reg)) {
          if (AMDGPU::TTMP_32RegClass.contains(reg)) {
            llvm_unreachable("Trap Handler registers should not be used");
          }
          isSGPR = true;
          width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
          isSGPR = false;
          width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
          if (AMDGPU::TTMP_64RegClass.contains(reg)) {
            llvm_unreachable("Trap Handler registers should not be used");
          }
          isSGPR = true;
          width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
          isSGPR = false;
          width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
          isSGPR = false;
          width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
          isSGPR = true;
          width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
          isSGPR = false;
          width = 4;
        } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
          isSGPR = true;
          width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
          isSGPR = false;
          width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
          isSGPR = true;
          width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
          isSGPR = false;
          width = 16;
        } else {
          llvm_unreachable("Unknown register class");
        }
        unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
        unsigned maxUsed = hwReg + width - 1;
        if (isSGPR) {
          MaxSGPR = std::max(maxUsed, MaxSGPR);
        } else {
          MaxVGPR = std::max(maxUsed, MaxVGPR);
        }
      }
    }
  }

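  // VCC, FLAT_SCRATCH, and (on VI) the XNACK mask are allocated at the top of
  // the wave's SGPR space when used, so reserve room for them beyond the last
  // explicitly used SGPR. Note the values below are cumulative totals, not
  // independent increments.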
  unsigned ExtraSGPRs = 0;

  if (VCCUsed)
    ExtraSGPRs = 2;

  if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (FlatUsed)
      ExtraSGPRs = 4;
  } else {
    if (STM.isXNACKEnabled())
      ExtraSGPRs = 4;

    if (FlatUsed)
      ExtraSGPRs = 6;
  }

  MaxSGPR += ExtraSGPRs;

  // Update the necessary Reserved* fields and the maximum VGPR used if the
  // "amdgpu-debugger-reserve-trap-regs" attribute was specified.
  if (STM.debuggerReserveTrapVGPRs()) {
    ProgInfo.ReservedVGPRFirst = MaxVGPR + 1;
    ProgInfo.ReservedVGPRCount = MFI->getDebuggerReserveTrapVGPRCount();
    MaxVGPR += MFI->getDebuggerReserveTrapVGPRCount();
  }

  // We found the maximum register index. They start at 0, so add one to get
  // the number of registers.
  ProgInfo.NumVGPR = MaxVGPR + 1;
  ProgInfo.NumSGPR = MaxSGPR + 1;

  if (STM.hasSGPRInitBug()) {
    if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
      LLVMContext &Ctx = MF.getFunction()->getContext();
      Ctx.emitError("too many SGPRs used with the SGPR init bug");
    }

    ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
  }

  if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction()->getContext();
    Ctx.emitError("too many user SGPRs used");
  }

  if (MFI->LDSSize > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction()->getContext();
    Ctx.emitError("LDS size exceeds device maximum");
  }

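  // The VGPRS and SGPRS fields of COMPUTE_PGM_RSRC1 are encoded in units of
  // the allocation granule assumed here (4 VGPRs and 8 SGPRs per granule),
  // and each field holds one less than the number of granules.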
  ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
  ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
  // Set the value to initialize the FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  ProgInfo.IEEEMode = 0;

  // Make the clamp modifier return 0 on NaN inputs.
  ProgInfo.DX10Clamp = 1;

  const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
  ProgInfo.ScratchSize = FrameInfo->getStackSize();

  ProgInfo.FlatUsed = FlatUsed;
  ProgInfo.VCCUsed = VCCUsed;
  ProgInfo.CodeLen = CodeSize;

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
                          MFI->getMaximumWorkGroupSize(MF);

  ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: // Fall through
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  }
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &KernelInfo) {
  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());

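  // This path is only reached for non-HSA targets (see runOnMachineFunction);
  // like the R600 path, it writes (register, value) pairs into the
  // .AMDGPU.config section for the userspace driver to program.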
  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
    if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
    }
  }

  if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
  }
}

// Encode Size as an amd_element_byte_size_t, which holds log2(Size).
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
                                          const SIProgramInfo &KernelInfo) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  amd_kernel_code_t header;

  AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());

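  // Start from the target's defaults, then fill in the fields computed for
  // this kernel. COMPUTE_PGM_RSRC1 occupies the low 32 bits of
  // compute_pgm_resource_registers and COMPUTE_PGM_RSRC2 the high 32 bits.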
  // Widen ComputePGMRSrc2 before shifting; shifting a 32-bit value by 32 is
  // undefined behavior.
  header.compute_pgm_resource_registers =
      KernelInfo.ComputePGMRSrc1 |
      (uint64_t(KernelInfo.ComputePGMRSrc2) << 32);
  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;

  AMD_HSA_BITS_SET(header.code_properties,
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));

  if (MFI->hasPrivateSegmentBuffer()) {
    header.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (MFI->hasDispatchPtr())
    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (MFI->hasQueuePtr())
    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (MFI->hasKernargSegmentPtr())
    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (MFI->hasDispatchID())
    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (MFI->hasFlatScratchInit())
    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  // TODO: Private segment size

  if (MFI->hasGridWorkgroupCountX()) {
    header.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
  }

  if (MFI->hasGridWorkgroupCountY()) {
    header.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
  }

  if (MFI->hasGridWorkgroupCountZ()) {
    header.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
  }

  if (STM.isXNACKEnabled())
    header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  header.kernarg_segment_byte_size = MFI->ABIArgOffset;
  header.wavefront_sgpr_count = KernelInfo.NumSGPR;
  header.workitem_vgpr_count = KernelInfo.NumVGPR;
  header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
  header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
  header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
  header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;

  AMDGPUTargetStreamer *TS =
      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
  TS->EmitAMDKernelCodeT(header);
}

bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       unsigned AsmVariant,
                                       const char *ExtraCode, raw_ostream &O) {
  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    default:
      // See if this is a generic print operand.
      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
    case 'r':
      break;
    }
  }

  AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
                   *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
  return false;
}