//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUBaseInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#define GET_INSTRINFO_NAMED_OPS
#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
#undef GET_INSTRMAP_INFO
#undef GET_INSTRINFO_NAMED_OPS

namespace {

/// \returns Bit mask for given bit \p Shift and bit \p Width.
unsigned getBitMask(unsigned Shift, unsigned Width) {
  return ((1 << Width) - 1) << Shift;
}
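// For illustration: getBitMask(4, 3) == 0x70, i.e. three set bits starting at
// bit 4. The mask is computed in 32-bit arithmetic, so Width is assumed to be
// less than 32.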

/// Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
///
/// \returns Packed \p Dst.
unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
  Dst &= ~(1 << Shift) & ~getBitMask(Shift, Width);
  Dst |= (Src << Shift) & getBitMask(Shift, Width);
  return Dst;
}

/// Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
///
/// \returns Unpacked bits.
unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
  return (Src & getBitMask(Shift, Width)) >> Shift;
}
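// For illustration, packing and then unpacking round-trips a field value:
//   packBits(/*Src=*/5, /*Dst=*/0, /*Shift=*/4, /*Width=*/3) == 0x50
//   unpackBits(0x50, /*Shift=*/4, /*Width=*/3)               == 5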

/// \returns Vmcnt bit shift (lower bits).
unsigned getVmcntBitShiftLo() { return 0; }

/// \returns Vmcnt bit width (lower bits).
unsigned getVmcntBitWidthLo() { return 4; }

/// \returns Expcnt bit shift.
unsigned getExpcntBitShift() { return 4; }

/// \returns Expcnt bit width.
unsigned getExpcntBitWidth() { return 3; }

/// \returns Lgkmcnt bit shift.
unsigned getLgkmcntBitShift() { return 8; }

/// \returns Lgkmcnt bit width.
unsigned getLgkmcntBitWidth() { return 4; }

/// \returns Vmcnt bit shift (higher bits).
unsigned getVmcntBitShiftHi() { return 14; }

/// \returns Vmcnt bit width (higher bits).
unsigned getVmcntBitWidthHi() { return 2; }
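
// Taken together, these getters describe the layout of the s_waitcnt simm16
// operand: vmcnt occupies bits [3:0] (plus bits [15:14] on gfx9+), expcnt
// occupies bits [6:4], and lgkmcnt occupies bits [11:8].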

} // end namespace anonymous

namespace llvm {

namespace AMDGPU {

struct MIMGInfo {
  uint16_t Opcode;
  uint16_t BaseOpcode;
  uint8_t MIMGEncoding;
  uint8_t VDataDwords;
  uint8_t VAddrDwords;
};

#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
#define GET_MIMGLZMappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"

int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
                  unsigned VDataDwords, unsigned VAddrDwords) {
  const MIMGInfo *Info = getMIMGOpcodeHelper(BaseOpcode, MIMGEncoding,
                                             VDataDwords, VAddrDwords);
  return Info ? Info->Opcode : -1;
}

int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
  const MIMGInfo *OrigInfo = getMIMGInfo(Opc);
  const MIMGInfo *NewInfo =
      getMIMGOpcodeHelper(OrigInfo->BaseOpcode, OrigInfo->MIMGEncoding,
                          NewChannels, OrigInfo->VAddrDwords);
  return NewInfo ? NewInfo->Opcode : -1;
}

struct MUBUFInfo {
  uint16_t Opcode;
  uint16_t BaseOpcode;
  uint8_t dwords;
  bool has_vaddr;
  bool has_srsrc;
  bool has_soffset;
};

#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"

int getMUBUFBaseOpcode(unsigned Opc) {
  const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc);
  return Info ? Info->BaseOpcode : -1;
}

int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) {
  const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords);
  return Info ? Info->Opcode : -1;
}

int getMUBUFDwords(unsigned Opc) {
  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
  return Info ? Info->dwords : 0;
}

bool getMUBUFHasVAddr(unsigned Opc) {
  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
  return Info ? Info->has_vaddr : false;
}

bool getMUBUFHasSrsrc(unsigned Opc) {
  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
  return Info ? Info->has_srsrc : false;
}

bool getMUBUFHasSoffset(unsigned Opc) {
  const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
  return Info ? Info->has_soffset : false;
}

// Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
int getMCOpcode(uint16_t Opcode, unsigned Gen) {
  return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}

namespace IsaInfo {

void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
  auto TargetTriple = STI->getTargetTriple();
  auto Version = getIsaVersion(STI->getCPU());

  Stream << TargetTriple.getArchName() << '-'
         << TargetTriple.getVendorName() << '-'
         << TargetTriple.getOSName() << '-'
         << TargetTriple.getEnvironmentName() << '-'
         << "gfx"
         << Version.Major
         << Version.Minor
         << Version.Stepping;

  if (hasXNACK(*STI))
    Stream << "+xnack";
  if (hasSRAMECC(*STI))
    Stream << "+sram-ecc";

  Stream.flush();
}

bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
  return STI->getTargetTriple().getOS() == Triple::AMDHSA &&
             STI->getFeatureBits().test(FeatureCodeObjectV3);
}

unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
  if (STI->getFeatureBits().test(FeatureWavefrontSize16))
    return 16;
  if (STI->getFeatureBits().test(FeatureWavefrontSize32))
    return 32;

  return 64;
}

unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
  if (STI->getFeatureBits().test(FeatureLocalMemorySize32768))
    return 32768;
  if (STI->getFeatureBits().test(FeatureLocalMemorySize65536))
    return 65536;

  return 0;
}

unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
  return 4;
}

unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
                               unsigned FlatWorkGroupSize) {
  if (!STI->getFeatureBits().test(FeatureGCN))
    return 8;
  unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize);
  if (N == 1)
    return 40;
  N = 40 / N;
  return std::min(N, 16u);
}
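// Worked example (GCN, wave64): FlatWorkGroupSize = 256 gives 4 waves per
// work-group, so 40 / 4 = 10 work-groups fit on a CU; with 2 waves per
// work-group the 40 / 2 = 20 result is clamped to the hardware limit of 16.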

unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
  return getMaxWavesPerEU() * getEUsPerCU(STI);
}

unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
                          unsigned FlatWorkGroupSize) {
  return getWavesPerWorkGroup(STI, FlatWorkGroupSize);
}

unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
  return 1;
}

unsigned getMaxWavesPerEU() {
  // FIXME: Need to take scratch memory into account.
  return 10;
}

unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
                          unsigned FlatWorkGroupSize) {
  return alignTo(getMaxWavesPerCU(STI, FlatWorkGroupSize),
                 getEUsPerCU(STI)) / getEUsPerCU(STI);
}

unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) {
  return 1;
}

unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) {
  return 2048;
}

unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
                              unsigned FlatWorkGroupSize) {
  return alignTo(FlatWorkGroupSize, getWavefrontSize(STI)) /
                 getWavefrontSize(STI);
}
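// e.g. with a 64-wide wavefront, FlatWorkGroupSize = 200 rounds up to 256 and
// therefore needs 4 waves per work-group.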

unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) {
  IsaVersion Version = getIsaVersion(STI->getCPU());
  if (Version.Major >= 8)
    return 16;
  return 8;
}

unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI) {
  return 8;
}

unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI) {
  IsaVersion Version = getIsaVersion(STI->getCPU());
  if (Version.Major >= 8)
    return 800;
  return 512;
}

unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) {
  if (STI->getFeatureBits().test(FeatureSGPRInitBug))
    return FIXED_NUM_SGPRS_FOR_INIT_BUG;

  IsaVersion Version = getIsaVersion(STI->getCPU());
  if (Version.Major >= 8)
    return 102;
  return 104;
}

unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
  assert(WavesPerEU != 0);

  if (WavesPerEU >= getMaxWavesPerEU())
    return 0;

  unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1);
  if (STI->getFeatureBits().test(FeatureTrapHandler))
    MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
  MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(STI)) + 1;
  return std::min(MinNumSGPRs, getAddressableNumSGPRs(STI));
}
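// Worked example (gfx8, no trap handler): WavesPerEU = 8 gives 800 / 9 = 88,
// aligned down to 80, plus 1 = 81, which is below the 102 addressable SGPRs,
// so the minimum is 81.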

unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                        bool Addressable) {
  assert(WavesPerEU != 0);

  IsaVersion Version = getIsaVersion(STI->getCPU());
  unsigned AddressableNumSGPRs = getAddressableNumSGPRs(STI);
  if (Version.Major >= 8 && !Addressable)
    AddressableNumSGPRs = 112;
  unsigned MaxNumSGPRs = getTotalNumSGPRs(STI) / WavesPerEU;
  if (STI->getFeatureBits().test(FeatureTrapHandler))
    MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS);
  MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(STI));
  return std::min(MaxNumSGPRs, AddressableNumSGPRs);
}
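// Worked example (gfx8, no trap handler, Addressable = true): WavesPerEU = 8
// gives 800 / 8 = 100, aligned down to the 16-SGPR granule = 96, clamped to
// the 102 addressable SGPRs -> 96.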

unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
                          bool FlatScrUsed, bool XNACKUsed) {
  unsigned ExtraSGPRs = 0;
  if (VCCUsed)
    ExtraSGPRs = 2;

  IsaVersion Version = getIsaVersion(STI->getCPU());
  if (Version.Major < 8) {
    if (FlatScrUsed)
      ExtraSGPRs = 4;
  } else {
    if (XNACKUsed)
      ExtraSGPRs = 4;

    if (FlatScrUsed)
      ExtraSGPRs = 6;
  }

  return ExtraSGPRs;
}

unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed,
                          bool FlatScrUsed) {
  return getNumExtraSGPRs(STI, VCCUsed, FlatScrUsed,
                          STI->getFeatureBits().test(AMDGPU::FeatureXNACK));
}

unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
  NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(STI));
  // SGPRBlocks is actual number of SGPR blocks minus 1.
  return NumSGPRs / getSGPREncodingGranule(STI) - 1;
}
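// e.g. NumSGPRs = 37 is rounded up to 40 and encoded as 40 / 8 - 1 = 4.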

unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) {
  return 4;
}

unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) {
  return getVGPRAllocGranule(STI);
}

unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
  return 256;
}

unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
  return getTotalNumVGPRs(STI);
}

unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
  assert(WavesPerEU != 0);

  if (WavesPerEU >= getMaxWavesPerEU())
    return 0;
  unsigned MinNumVGPRs =
      alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
                getVGPRAllocGranule(STI)) + 1;
  return std::min(MinNumVGPRs, getAddressableNumVGPRs(STI));
}

unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
  assert(WavesPerEU != 0);

  unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
                                   getVGPRAllocGranule(STI));
  unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
  return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}

unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) {
  NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI));
  // VGPRBlocks is actual number of VGPR blocks minus 1.
  return NumVGPRs / getVGPREncodingGranule(STI) - 1;
}
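// e.g. NumVGPRs = 25 is rounded up to 28 and encoded as 28 / 4 - 1 = 6.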

} // end namespace IsaInfo

void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                               const MCSubtargetInfo *STI) {
  IsaVersion Version = getIsaVersion(STI->getCPU());

  memset(&Header, 0, sizeof(Header));

  Header.amd_kernel_code_version_major = 1;
  Header.amd_kernel_code_version_minor = 2;
  Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
  Header.amd_machine_version_major = Version.Major;
  Header.amd_machine_version_minor = Version.Minor;
  Header.amd_machine_version_stepping = Version.Stepping;
  Header.kernel_code_entry_byte_offset = sizeof(Header);
  // wavefront_size is specified as a power of 2: 2^6 = 64 threads.
  Header.wavefront_size = 6;

  // If the code object does not support indirect functions, then the value must
  // be 0xffffffff.
  Header.call_convention = -1;

  // These alignment values are specified in powers of two, so alignment =
  // 2^n.  The minimum alignment is 2^4 = 16.
  Header.kernarg_segment_alignment = 4;
  Header.group_segment_alignment = 4;
  Header.private_segment_alignment = 4;
}

amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() {
  amdhsa::kernel_descriptor_t KD;
  memset(&KD, 0, sizeof(KD));
  AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
                  amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
                  amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
  AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
                  amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, 1);
  AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
                  amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);
  AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
                  amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
  return KD;
}

bool isGroupSegment(const GlobalValue *GV) {
  return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
}

bool isGlobalSegment(const GlobalValue *GV) {
  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}

bool isReadOnlySegment(const GlobalValue *GV) {
  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
         GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
}

bool shouldEmitConstantsToTextSection(const Triple &TT) {
  return TT.getOS() != Triple::AMDHSA;
}

int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
  Attribute A = F.getFnAttribute(Name);
  int Result = Default;

  if (A.isStringAttribute()) {
    StringRef Str = A.getValueAsString();
    if (Str.getAsInteger(0, Result)) {
      LLVMContext &Ctx = F.getContext();
      Ctx.emitError("can't parse integer attribute " + Name);
    }
  }

  return Result;
}

std::pair<int, int> getIntegerPairAttribute(const Function &F,
                                            StringRef Name,
                                            std::pair<int, int> Default,
                                            bool OnlyFirstRequired) {
  Attribute A = F.getFnAttribute(Name);
  if (!A.isStringAttribute())
    return Default;

  LLVMContext &Ctx = F.getContext();
  std::pair<int, int> Ints = Default;
  std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
  if (Strs.first.trim().getAsInteger(0, Ints.first)) {
    Ctx.emitError("can't parse first integer attribute " + Name);
    return Default;
  }
  if (Strs.second.trim().getAsInteger(0, Ints.second)) {
    if (!OnlyFirstRequired || !Strs.second.trim().empty()) {
      Ctx.emitError("can't parse second integer attribute " + Name);
      return Default;
    }
  }

  return Ints;
}
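// For illustration, an attribute string of "2,4" yields {2, 4}; with
// OnlyFirstRequired set, a plain "2" yields {2, Default.second} without
// emitting an error.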

unsigned getVmcntBitMask(const IsaVersion &Version) {
  unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1;
  if (Version.Major < 9)
    return VmcntLo;

  unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo();
  return VmcntLo | VmcntHi;
}

unsigned getExpcntBitMask(const IsaVersion &Version) {
  return (1 << getExpcntBitWidth()) - 1;
}

unsigned getLgkmcntBitMask(const IsaVersion &Version) {
  return (1 << getLgkmcntBitWidth()) - 1;
}

unsigned getWaitcntBitMask(const IsaVersion &Version) {
  unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
  unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
  unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
  unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt;
  if (Version.Major < 9)
    return Waitcnt;

  unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi());
  return Waitcnt | VmcntHi;
}
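// For illustration, the combined wait-count mask is 0x0F7F before gfx9 and
// 0xCF7F on gfx9+, where vmcnt gains the two extra bits at [15:14].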

unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt) {
  unsigned VmcntLo =
      unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
  if (Version.Major < 9)
    return VmcntLo;

  unsigned VmcntHi =
      unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
  VmcntHi <<= getVmcntBitWidthLo();
  return VmcntLo | VmcntHi;
}

unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) {
  return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
}

unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) {
  return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
}

void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
                   unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
  Vmcnt = decodeVmcnt(Version, Waitcnt);
  Expcnt = decodeExpcnt(Version, Waitcnt);
  Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
}

Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) {
  Waitcnt Decoded;
  Decoded.VmCnt = decodeVmcnt(Version, Encoded);
  Decoded.ExpCnt = decodeExpcnt(Version, Encoded);
  Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded);
  return Decoded;
}

unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt,
                     unsigned Vmcnt) {
  Waitcnt =
      packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
  if (Version.Major < 9)
    return Waitcnt;

  Vmcnt >>= getVmcntBitWidthLo();
  return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
}

unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt,
                      unsigned Expcnt) {
  return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
}

unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
                       unsigned Lgkmcnt) {
  return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
}

unsigned encodeWaitcnt(const IsaVersion &Version,
                       unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
  unsigned Waitcnt = getWaitcntBitMask(Version);
  Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt);
  Waitcnt = encodeExpcnt(Version, Waitcnt, Expcnt);
  Waitcnt = encodeLgkmcnt(Version, Waitcnt, Lgkmcnt);
  return Waitcnt;
}

unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
  return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt);
}
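// Worked example (gfx9): encodeWaitcnt(Version, /*Vmcnt=*/1, /*Expcnt=*/2,
// /*Lgkmcnt=*/3) starts from the field mask 0xCF7F and packs each counter,
// yielding 0x0321; decodeWaitcnt recovers {1, 2, 3} from that encoding.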

unsigned getInitialPSInputAddr(const Function &F) {
  return getIntegerAttribute(F, "InitialPSInputAddr", 0);
}

bool isShader(CallingConv::ID cc) {
  switch(cc) {
    case CallingConv::AMDGPU_VS:
    case CallingConv::AMDGPU_LS:
    case CallingConv::AMDGPU_HS:
    case CallingConv::AMDGPU_ES:
    case CallingConv::AMDGPU_GS:
    case CallingConv::AMDGPU_PS:
    case CallingConv::AMDGPU_CS:
      return true;
    default:
      return false;
  }
}

bool isCompute(CallingConv::ID cc) {
  return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
}

bool isEntryFunctionCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
    return true;
  default:
    return false;
  }
}

bool hasXNACK(const MCSubtargetInfo &STI) {
  return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
}

bool hasSRAMECC(const MCSubtargetInfo &STI) {
  return STI.getFeatureBits()[AMDGPU::FeatureSRAMECC];
}

bool hasMIMG_R128(const MCSubtargetInfo &STI) {
  return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
}

bool hasPackedD16(const MCSubtargetInfo &STI) {
  return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem];
}

bool isSI(const MCSubtargetInfo &STI) {
  return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
}

bool isCI(const MCSubtargetInfo &STI) {
  return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands];
}

bool isVI(const MCSubtargetInfo &STI) {
  return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
}

bool isGFX9(const MCSubtargetInfo &STI) {
  return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
}

bool isGCN3Encoding(const MCSubtargetInfo &STI) {
  return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}

bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
  const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
  return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
    Reg == AMDGPU::SCC;
}

bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
  for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) {
    if (*R == Reg1) return true;
  }
  return false;
}

#define MAP_REG2REG \
  using namespace AMDGPU; \
  switch(Reg) { \
  default: return Reg; \
  CASE_CI_VI(FLAT_SCR) \
  CASE_CI_VI(FLAT_SCR_LO) \
  CASE_CI_VI(FLAT_SCR_HI) \
  CASE_VI_GFX9(TTMP0) \
  CASE_VI_GFX9(TTMP1) \
  CASE_VI_GFX9(TTMP2) \
  CASE_VI_GFX9(TTMP3) \
  CASE_VI_GFX9(TTMP4) \
  CASE_VI_GFX9(TTMP5) \
  CASE_VI_GFX9(TTMP6) \
  CASE_VI_GFX9(TTMP7) \
  CASE_VI_GFX9(TTMP8) \
  CASE_VI_GFX9(TTMP9) \
  CASE_VI_GFX9(TTMP10) \
  CASE_VI_GFX9(TTMP11) \
  CASE_VI_GFX9(TTMP12) \
  CASE_VI_GFX9(TTMP13) \
  CASE_VI_GFX9(TTMP14) \
  CASE_VI_GFX9(TTMP15) \
  CASE_VI_GFX9(TTMP0_TTMP1) \
  CASE_VI_GFX9(TTMP2_TTMP3) \
  CASE_VI_GFX9(TTMP4_TTMP5) \
  CASE_VI_GFX9(TTMP6_TTMP7) \
  CASE_VI_GFX9(TTMP8_TTMP9) \
  CASE_VI_GFX9(TTMP10_TTMP11) \
  CASE_VI_GFX9(TTMP12_TTMP13) \
  CASE_VI_GFX9(TTMP14_TTMP15) \
  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \
  CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \
  CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \
  CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \
  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
  CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
  CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
  CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
  }

#define CASE_CI_VI(node) \
  assert(!isSI(STI)); \
  case node: return isCI(STI) ? node##_ci : node##_vi;

#define CASE_VI_GFX9(node) \
  case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;

unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
  if (STI.getTargetTriple().getArch() == Triple::r600)
    return Reg;
  MAP_REG2REG
}
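// e.g. getMCReg(AMDGPU::TTMP0, STI) yields TTMP0_vi on VI and TTMP0_gfx9 on
// GFX9; mc2PseudoReg below maps either hardware register back to TTMP0.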

#undef CASE_CI_VI
#undef CASE_VI_GFX9

#define CASE_CI_VI(node)   case node##_ci: case node##_vi:   return node;
#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node;

unsigned mc2PseudoReg(unsigned Reg) {
  MAP_REG2REG
}

#undef CASE_CI_VI
#undef CASE_VI_GFX9
#undef MAP_REG2REG

bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
  assert(OpNo < Desc.NumOperands);
  unsigned OpType = Desc.OpInfo[OpNo].OperandType;
  return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
         OpType <= AMDGPU::OPERAND_SRC_LAST;
}

bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
  assert(OpNo < Desc.NumOperands);
  unsigned OpType = Desc.OpInfo[OpNo].OperandType;
  switch (OpType) {
  case AMDGPU::OPERAND_REG_IMM_FP32:
  case AMDGPU::OPERAND_REG_IMM_FP64:
  case AMDGPU::OPERAND_REG_IMM_FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
    return true;
  default:
    return false;
  }
}

bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
  assert(OpNo < Desc.NumOperands);
  unsigned OpType = Desc.OpInfo[OpNo].OperandType;
  return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
         OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
}

// Avoid using MCRegisterClass::getSize, since that function will go away
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
  switch (RCID) {
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::VGPR_32RegClassID:
  case AMDGPU::VS_32RegClassID:
  case AMDGPU::SReg_32RegClassID:
  case AMDGPU::SReg_32_XM0RegClassID:
    return 32;
  case AMDGPU::SGPR_64RegClassID:
  case AMDGPU::VS_64RegClassID:
  case AMDGPU::SReg_64RegClassID:
  case AMDGPU::VReg_64RegClassID:
  case AMDGPU::SReg_64_XEXECRegClassID:
    return 64;
  case AMDGPU::VReg_96RegClassID:
    return 96;
  case AMDGPU::SGPR_128RegClassID:
  case AMDGPU::SReg_128RegClassID:
  case AMDGPU::VReg_128RegClassID:
    return 128;
  case AMDGPU::SReg_256RegClassID:
  case AMDGPU::VReg_256RegClassID:
    return 256;
  case AMDGPU::SReg_512RegClassID:
  case AMDGPU::VReg_512RegClassID:
    return 512;
  default:
    llvm_unreachable("Unexpected register class");
  }
}

unsigned getRegBitWidth(const MCRegisterClass &RC) {
  return getRegBitWidth(RC.getID());
}

unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
                           unsigned OpNo) {
  assert(OpNo < Desc.NumOperands);
  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
  return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
}

bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
  if (Literal >= -16 && Literal <= 64)
    return true;

  uint64_t Val = static_cast<uint64_t>(Literal);
  return (Val == DoubleToBits(0.0)) ||
         (Val == DoubleToBits(1.0)) ||
         (Val == DoubleToBits(-1.0)) ||
         (Val == DoubleToBits(0.5)) ||
         (Val == DoubleToBits(-0.5)) ||
         (Val == DoubleToBits(2.0)) ||
         (Val == DoubleToBits(-2.0)) ||
         (Val == DoubleToBits(4.0)) ||
         (Val == DoubleToBits(-4.0)) ||
         (Val == 0x3fc45f306dc9c882 && HasInv2Pi);
}

bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
  if (Literal >= -16 && Literal <= 64)
    return true;

  // The actual type of the operand does not seem to matter as long
  // as the bits match one of the inline immediate values.  For example:
  //
  // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
  // so it is a legal inline immediate.
  //
  // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
  // floating-point, so it is a legal inline immediate.

  uint32_t Val = static_cast<uint32_t>(Literal);
  return (Val == FloatToBits(0.0f)) ||
         (Val == FloatToBits(1.0f)) ||
         (Val == FloatToBits(-1.0f)) ||
         (Val == FloatToBits(0.5f)) ||
         (Val == FloatToBits(-0.5f)) ||
         (Val == FloatToBits(2.0f)) ||
         (Val == FloatToBits(-2.0f)) ||
         (Val == FloatToBits(4.0f)) ||
         (Val == FloatToBits(-4.0f)) ||
         (Val == 0x3e22f983 && HasInv2Pi);
}

bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
  if (!HasInv2Pi)
    return false;

  if (Literal >= -16 && Literal <= 64)
    return true;

  uint16_t Val = static_cast<uint16_t>(Literal);
  return Val == 0x3C00 || // 1.0
         Val == 0xBC00 || // -1.0
         Val == 0x3800 || // 0.5
         Val == 0xB800 || // -0.5
         Val == 0x4000 || // 2.0
         Val == 0xC000 || // -2.0
         Val == 0x4400 || // 4.0
         Val == 0xC400 || // -4.0
         Val == 0x3118;   // 1/2pi
}

bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
  assert(HasInv2Pi);

  int16_t Lo16 = static_cast<int16_t>(Literal);
  int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
  return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
}
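// e.g. 0x3C003C00 (both halves 1.0 in half precision) is inlinable as a
// packed v2f16 literal, whereas 0x3C000000 is not because its halves differ.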

bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  CallingConv::ID CC = F->getCallingConv();
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return true;
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
    // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
    // Everything else is in VGPRs.
    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
  default:
    // TODO: Should calls support inreg for SGPR inputs?
    return false;
  }
}

int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
  if (isGCN3Encoding(ST))
    return ByteOffset;
  return ByteOffset >> 2;
}

bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
  int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
  return isGCN3Encoding(ST) ?
    isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
}
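// e.g. on SI/CI a byte offset of 1020 encodes as dword offset 255 and still
// fits the 8-bit immediate; with the GCN3 encoding the byte offset is used
// directly and may occupy up to 20 bits.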

// Given Imm, split it into the values to put into the SOffset and ImmOffset
// fields in an MUBUF instruction. Return false if it is not possible (due to a
// hardware bug needing a workaround).
//
// The required alignment ensures that individual address components remain
// aligned if they are aligned to begin with. It also ensures that additional
// offsets within the given alignment can be added to the resulting ImmOffset.
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
                      const GCNSubtarget *Subtarget, uint32_t Align) {
  const uint32_t MaxImm = alignDown(4095, Align);
  uint32_t Overflow = 0;

  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      // Use an SOffset inline constant for 4..64
      Overflow = Imm - MaxImm;
      Imm = MaxImm;
    } else {
      // Try to keep the same value in SOffset for adjacent loads, so that
      // the corresponding register contents can be re-used.
      //
      // Load values with all low-bits (except for alignment bits) set into
      // SOffset, so that a larger range of values can be covered using
      // s_movk_i32.
      //
      // Atomic operations fail to work correctly when individual address
      // components are unaligned, even if their sum is aligned.
      uint32_t High = (Imm + Align) & ~4095;
      uint32_t Low = (Imm + Align) & 4095;
      Imm = Low;
      Overflow = High - Align;
    }
  }

  // There is a hardware bug in SI and CI which prevents address clamping in
  // MUBUF instructions from working correctly with SOffsets. The immediate
  // offset is unaffected.
  if (Overflow > 0 &&
      Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
    return false;

  ImmOffset = Imm;
  SOffset = Overflow;
  return true;
}
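// Worked example (Align = 4): Imm = 5000 exceeds MaxImm = 4092, so the split
// produces ImmOffset = 908 and SOffset = 4092 (908 + 4092 == 5000, both
// 4-byte aligned); on SI/CI the non-zero SOffset makes the helper return
// false instead.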

namespace {

struct SourceOfDivergence {
  unsigned Intr;
};
const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);

#define GET_SourcesOfDivergence_IMPL
#include "AMDGPUGenSearchableTables.inc"

} // end anonymous namespace

bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
  return lookupSourceOfDivergence(IntrID);
}
} // namespace AMDGPU
} // namespace llvm