//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer  -------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 ///
12 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
13 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
14 /// an MCObjectStreamer it outputs binary code.
15 //
16 //===----------------------------------------------------------------------===//
17 //
18 
19 #include "AMDGPUAsmPrinter.h"
20 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
21 #include "InstPrinter/AMDGPUInstPrinter.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "AMDGPU.h"
24 #include "AMDKernelCodeT.h"
25 #include "AMDGPUSubtarget.h"
26 #include "R600Defines.h"
27 #include "R600MachineFunctionInfo.h"
28 #include "R600RegisterInfo.h"
29 #include "SIDefines.h"
30 #include "SIMachineFunctionInfo.h"
31 #include "SIRegisterInfo.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/MC/MCContext.h"
34 #include "llvm/MC/MCSectionELF.h"
35 #include "llvm/MC/MCStreamer.h"
36 #include "llvm/Support/ELF.h"
37 #include "llvm/Support/MathExtras.h"
38 #include "llvm/Support/TargetRegistry.h"
39 #include "llvm/Target/TargetLoweringObjectFile.h"
40 
41 using namespace llvm;
42 
43 // TODO: This should get the default rounding mode from the kernel. We just set
44 // the default here, but this could change if the OpenCL rounding mode pragmas
45 // are used.
46 //
47 // The denormal mode here should match what is reported by the OpenCL runtime
48 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
50 //
51 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
52 // precision, and leaves single precision to flush all and does not report
53 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
54 // CL_FP_DENORM for both.
55 //
56 // FIXME: It seems some instructions do not support single precision denormals
57 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
58 // and sin_f32, cos_f32 on most parts).
59 
60 // We want to use these instructions, and using fp32 denormals also causes
61 // instructions to run at the double precision rate for the device so it's
62 // probably best to just report no single precision denormals.
63 static uint32_t getFPMode(const MachineFunction &F) {
64   const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>();
65   // TODO: Is there any real use for the flush in only / flush out only modes?
66 
67   uint32_t FP32Denormals =
68     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
69 
70   uint32_t FP64Denormals =
71     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
72 
73   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
74          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
75          FP_DENORM_MODE_SP(FP32Denormals) |
76          FP_DENORM_MODE_DP(FP64Denormals);
77 }
78 
/// Factory callback registered with the TargetRegistry; constructs the
/// AsmPrinter that lowers MachineFunctions for both AMDGPU targets.
static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}
84 
/// Entry point called by LLVM's target initialization machinery; registers
/// the asm printer factory for both the R600 and GCN targets.
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
}
89 
// Construction simply forwards to the base AsmPrinter; all per-function
// state is established later in runOnMachineFunction.
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {}
93 
94 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
95   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
96   SIProgramInfo KernelInfo;
97   if (STM.isAmdHsaOS()) {
98     getSIProgramInfo(KernelInfo, *MF);
99     EmitAmdKernelCodeT(*MF, KernelInfo);
100   }
101 }
102 
103 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
104 
105   // This label is used to mark the end of the .text section.
106   const TargetLoweringObjectFile &TLOF = getObjFileLowering();
107   OutStreamer->SwitchSection(TLOF.getTextSection());
108   MCSymbol *EndOfTextLabel =
109       OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME));
110   OutStreamer->EmitLabel(EndOfTextLabel);
111 }
112 
113 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
114   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
115   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
116   if (MFI->isKernel() && STM.isAmdHsaOS()) {
117     AMDGPUTargetStreamer *TS =
118         static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
119     TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
120                              ELF::STT_AMDGPU_HSA_KERNEL);
121   }
122 
123   AsmPrinter::EmitFunctionEntryLabel();
124 }
125 
/// Emit one machine function: per-kernel config words into .AMDGPU.config,
/// the body itself, optional human-readable stats (.AMDGPU.csdata), and an
/// optional disassembly dump (.AMDGPU.disasm). Returns false per AsmPrinter
/// convention (the function is not modified).
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {

  // The starting address of all shader programs must be 256 bytes aligned.
  // setAlignment takes a power of two: 8 -> 1 << 8 == 256 bytes.
  MF.setAlignment(8);

  SetupMachineFunction(MF);

  MCContext &Context = getObjFileLowering().getContext();
  MCSectionELF *ConfigSection =
      Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
  OutStreamer->SwitchSection(ConfigSection);

  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  SIProgramInfo KernelInfo;
  if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    // GCN path: compute resource usage once; config words are only emitted
    // for non-HSA targets (HSA kernels get an amd_kernel_code_t header from
    // EmitFunctionBodyStart instead).
    getSIProgramInfo(KernelInfo, MF);
    if (!STM.isAmdHsaOS()) {
      EmitProgramInfoSI(MF, KernelInfo);
    }
    // Emit directives
    AMDGPUTargetStreamer *TS =
        static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
    TS->EmitDirectiveHSACodeObjectVersion(1, 0);
    AMDGPU::IsaVersion ISA = STM.getIsaVersion();
    TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
                                      "AMD", "AMDGPU");
  } else {
    // Pre-GCN (R600 family) path.
    EmitProgramInfoR600(MF);
  }

  // Reset the disassembly buffers; they are filled during EmitFunctionBody
  // and consumed below when -dump-code is enabled.
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    // Human-readable kernel statistics, emitted as raw comments.
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      OutStreamer->emitRawComment(" Kernel info:", false);
      OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
                                  false);
      OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
                                  false);
      OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
                                  false);
      OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
                                  false);
      OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
                                  false);
      OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
                                  false);

      const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                  Twine(MFI->NumUserSGPRs),
                                  false);
    } else {
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      OutStreamer->emitRawComment(
        Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
    }
  }

  if (STM.dumpCode()) {

    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    // Pair each disassembled line with its hex encoding, padding so the hex
    // column lines up at DisasmLineMaxLen.
    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
      Comment += " ; " + HexLines[i] + "\n";

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}
210 
/// Emit the R600-family (pre-GCN) program config as (register address,
/// value) pairs: the stage-specific SQ_PGM_RESOURCES word, DB_SHADER_CONTROL,
/// and, for compute shaders, the LDS allocation.
void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
  unsigned MaxGPR = 0;
  bool killPixel = false;
  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  const R600RegisterInfo *RI =
      static_cast<const R600RegisterInfo *>(STM.getRegisterInfo());
  const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  // Scan every instruction to find the highest GPR index referenced and
  // whether KILLGT appears (feeds DB_SHADER_CONTROL's KILL_ENABLE below).
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (MI.getOpcode() == AMDGPU::KILLGT)
        killPixel = true;
      unsigned numOperands = MI.getNumOperands();
      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
        const MachineOperand &MO = MI.getOperand(op_idx);
        if (!MO.isReg())
          continue;
        unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;

        // Registers with encoding values > 127 are not GPRs.
        if (HWReg > 127)
          continue;
        MaxGPR = std::max(MaxGPR, HWReg);
      }
    }
  }

  // Pick the SQ_PGM_RESOURCES register for this hardware generation and
  // shader stage; unrecognized stages fall through to the first case.
  unsigned RsrcReg;
  if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
    // Evergreen / Northern Islands
    switch (MFI->getShaderType()) {
    default: // Fall through
    case ShaderType::COMPUTE:  RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
    case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
    case ShaderType::PIXEL:    RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
    case ShaderType::VERTEX:   RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
    }
  } else {
    // R600 / R700
    switch (MFI->getShaderType()) {
    default: // Fall through
    case ShaderType::GEOMETRY: // Fall through
    case ShaderType::COMPUTE:  // Fall through
    case ShaderType::VERTEX:   RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
    case ShaderType::PIXEL:    RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
    }
  }

  // Config words are (register address, value) pairs of 4 bytes each.
  OutStreamer->EmitIntValue(RsrcReg, 4);
  OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
                           S_STACK_SIZE(MFI->StackSize), 4);
  OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
  OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);

  if (MFI->getShaderType() == ShaderType::COMPUTE) {
    // Round the LDS byte size up to a multiple of 4 and report it in
    // 4-byte units.
    OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
    OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
  }
}
270 
271 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
272                                         const MachineFunction &MF) const {
273   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
274   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
275   uint64_t CodeSize = 0;
276   unsigned MaxSGPR = 0;
277   unsigned MaxVGPR = 0;
278   bool VCCUsed = false;
279   bool FlatUsed = false;
280   const SIRegisterInfo *RI =
281       static_cast<const SIRegisterInfo *>(STM.getRegisterInfo());
282 
283   for (const MachineBasicBlock &MBB : MF) {
284     for (const MachineInstr &MI : MBB) {
285       // TODO: CodeSize should account for multiple functions.
286 
287       // TODO: Should we count size of debug info?
288       if (MI.isDebugValue())
289         continue;
290 
291       // FIXME: This is reporting 0 for many instructions.
292       CodeSize += MI.getDesc().Size;
293 
294       unsigned numOperands = MI.getNumOperands();
295       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
296         const MachineOperand &MO = MI.getOperand(op_idx);
297         unsigned width = 0;
298         bool isSGPR = false;
299 
300         if (!MO.isReg())
301           continue;
302 
303         unsigned reg = MO.getReg();
304         switch (reg) {
305         case AMDGPU::EXEC:
306         case AMDGPU::SCC:
307         case AMDGPU::M0:
308           continue;
309 
310         case AMDGPU::VCC:
311         case AMDGPU::VCC_LO:
312         case AMDGPU::VCC_HI:
313           VCCUsed = true;
314           continue;
315 
316         case AMDGPU::FLAT_SCR:
317         case AMDGPU::FLAT_SCR_LO:
318         case AMDGPU::FLAT_SCR_HI:
319           FlatUsed = true;
320           continue;
321 
322         default:
323           break;
324         }
325 
326         if (AMDGPU::SReg_32RegClass.contains(reg)) {
327           isSGPR = true;
328           width = 1;
329         } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
330           isSGPR = false;
331           width = 1;
332         } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
333           isSGPR = true;
334           width = 2;
335         } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
336           isSGPR = false;
337           width = 2;
338         } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
339           isSGPR = false;
340           width = 3;
341         } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
342           isSGPR = true;
343           width = 4;
344         } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
345           isSGPR = false;
346           width = 4;
347         } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
348           isSGPR = true;
349           width = 8;
350         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
351           isSGPR = false;
352           width = 8;
353         } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
354           isSGPR = true;
355           width = 16;
356         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
357           isSGPR = false;
358           width = 16;
359         } else {
360           llvm_unreachable("Unknown register class");
361         }
362         unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
363         unsigned maxUsed = hwReg + width - 1;
364         if (isSGPR) {
365           MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
366         } else {
367           MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
368         }
369       }
370     }
371   }
372 
373   if (VCCUsed)
374     MaxSGPR += 2;
375 
376   if (FlatUsed)
377     MaxSGPR += 2;
378 
379   // We found the maximum register index. They start at 0, so add one to get the
380   // number of registers.
381   ProgInfo.NumVGPR = MaxVGPR + 1;
382   ProgInfo.NumSGPR = MaxSGPR + 1;
383 
384   if (STM.hasSGPRInitBug()) {
385     if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
386       LLVMContext &Ctx = MF.getFunction()->getContext();
387       Ctx.emitError("too many SGPRs used with the SGPR init bug");
388     }
389 
390     ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
391   }
392 
393   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
394   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
395   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
396   // register.
397   ProgInfo.FloatMode = getFPMode(MF);
398 
399   // XXX: Not quite sure what this does, but sc seems to unset this.
400   ProgInfo.IEEEMode = 0;
401 
402   // Do not clamp NAN to 0.
403   ProgInfo.DX10Clamp = 0;
404 
405   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
406   ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF);
407 
408   ProgInfo.FlatUsed = FlatUsed;
409   ProgInfo.VCCUsed = VCCUsed;
410   ProgInfo.CodeLen = CodeSize;
411 
412   unsigned LDSAlignShift;
413   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
414     // LDS is allocated in 64 dword blocks.
415     LDSAlignShift = 8;
416   } else {
417     // LDS is allocated in 128 dword blocks.
418     LDSAlignShift = 9;
419   }
420 
421   unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
422                           MFI->getMaximumWorkGroupSize(MF);
423 
424   ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
425   ProgInfo.LDSBlocks =
426      RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
427 
428   // Scratch is allocated in 256 dword blocks.
429   unsigned ScratchAlignShift = 10;
430   // We need to program the hardware with the amount of scratch memory that
431   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
432   // scratch memory used per thread.
433   ProgInfo.ScratchBlocks =
434     RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
435                        1 << ScratchAlignShift) >> ScratchAlignShift;
436 
437   ProgInfo.ComputePGMRSrc1 =
438       S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
439       S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
440       S_00B848_PRIORITY(ProgInfo.Priority) |
441       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
442       S_00B848_PRIV(ProgInfo.Priv) |
443       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
444       S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
445       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
446 
447   ProgInfo.ComputePGMRSrc2 =
448       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
449       S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
450       S_00B84C_TGID_X_EN(1) |
451       S_00B84C_TGID_Y_EN(1) |
452       S_00B84C_TGID_Z_EN(1) |
453       S_00B84C_TG_SIZE_EN(1) |
454       S_00B84C_TIDIG_COMP_CNT(2) |
455       S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
456 }
457 
458 static unsigned getRsrcReg(unsigned ShaderType) {
459   switch (ShaderType) {
460   default: // Fall through
461   case ShaderType::COMPUTE:  return R_00B848_COMPUTE_PGM_RSRC1;
462   case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
463   case ShaderType::PIXEL:    return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
464   case ShaderType::VERTEX:   return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
465   }
466 }
467 
/// Emit the GCN program config as (register address, value) pairs: the
/// PGM_RSRC words, scratch wave size, and pixel-shader input state. Only
/// used for non-HSA targets (HSA emits amd_kernel_code_t instead).
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &KernelInfo) {
  const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MFI->getShaderType());

  if (MFI->getShaderType() == ShaderType::COMPUTE) {
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);

    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    // Graphics shaders: emit the stage-specific RSRC1 with register block
    // counts, plus the scratch ring size when VGPR spilling is enabled.
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
    if (STM.isVGPRSpillingEnabled(MFI)) {
      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
    }
  }

  if (MFI->getShaderType() == ShaderType::PIXEL) {
    // Pixel shaders additionally report extra LDS usage and the enabled
    // SPI input mask.
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->PSInputAddr, 4);
  }
}
504 
505 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
506                                          const SIProgramInfo &KernelInfo) const {
507   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
508   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
509   amd_kernel_code_t header;
510 
511   AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
512 
513   header.compute_pgm_resource_registers =
514       KernelInfo.ComputePGMRSrc1 |
515       (KernelInfo.ComputePGMRSrc2 << 32);
516   header.code_properties =
517       AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
518       AMD_CODE_PROPERTY_IS_PTR64;
519 
520   header.kernarg_segment_byte_size = MFI->ABIArgOffset;
521   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
522   header.workitem_vgpr_count = KernelInfo.NumVGPR;
523 
524   AMDGPUTargetStreamer *TS =
525       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
526   TS->EmitAMDKernelCodeT(header);
527 }
528 
529 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
530                                        unsigned AsmVariant,
531                                        const char *ExtraCode, raw_ostream &O) {
532   if (ExtraCode && ExtraCode[0]) {
533     if (ExtraCode[1] != 0)
534       return true; // Unknown modifier.
535 
536     switch (ExtraCode[0]) {
537     default:
538       // See if this is a generic print operand
539       return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
540     case 'r':
541       break;
542     }
543   }
544 
545   AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
546                    *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
547   return false;
548 }
549