//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

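// Reduce the number of vector elements so each resulting piece is roughly
// 64 bits wide, keeping the element type; LLT::scalarOrVector falls back to a
// plain scalar when only a single element remains.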
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
            (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

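// Match scalar truncating stores: a scalar register wider than 32 bits whose
// memory access size is smaller than the register size.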
static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal.  We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

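  // Size-comparison predicates between two type indices, used below to widen
  // or narrow the integer operand of G_INTTOPTR/G_PTRTOINT to match the
  // pointer size.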
  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

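  // Decide whether a load/store must be broken up: vector extloads, accesses
  // wider than the address space supports, 96-bit accesses without dwordx3
  // support, and under-aligned accesses the target cannot perform. Used by the
  // narrowScalarIf/fewerElementsIf rules for G_LOAD and G_STORE below.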
  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

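  // An alignment of 0 in the memory descriptors below places no alignment
  // requirement on the access, so subtargets with unaligned buffer access
  // accept any alignment for global, flat, and constant accesses.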
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

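  // For G_EXTRACT_VECTOR_ELT the element is type index 0 and the vector is
  // type index 1; G_INSERT_VECTOR_ELT swaps them. The index operand is type
  // index 2 for both opcodes.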
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64);

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

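// Return a 32-bit register holding the high half (aperture) of the flat
// address range corresponding to the given LDS or private address space. On
// subtargets with aperture registers this is read with s_getreg; otherwise it
// is loaded from the queue pointer.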
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
                               Type::getInt8Ty(MF.getFunction().getContext()),
                               AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

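// Lower G_ADDRSPACE_CAST. No-op casts become bitcasts; casts to/from the
// 32-bit constant address space are handled with an extract or merge; casts
// between flat and LDS/private insert the segment aperture and compare against
// the null pointer values of the source and destination address spaces.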
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

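// Lower G_FRINT for f64: adding and then subtracting 2^52 (with the sign of
// the source) rounds to an integer in the current rounding mode; values whose
// magnitude is already too large to have a fractional part are passed through
// unchanged.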
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

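// Extract the unbiased exponent field from the high 32 bits of an f64 value
// using the amdgcn.ubfe intrinsic.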
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

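// Custom lowering of G_INTRINSIC_TRUNC for f64: mask off the fraction bits
// selected by the exponent. A negative exponent produces a signed zero, and an
// exponent past the fraction width returns the source unchanged.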
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

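// Custom lowering for s64 G_SITOFP/G_UITOFP to f64: convert each 32-bit half
// separately, scale the converted high half by 2^32 with amdgcn.ldexp, and add
// the two results.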
1403 bool AMDGPULegalizerInfo::legalizeITOFP(
1404   MachineInstr &MI, MachineRegisterInfo &MRI,
1405   MachineIRBuilder &B, bool Signed) const {
1406   B.setInstr(MI);
1407 
1408   Register Dst = MI.getOperand(0).getReg();
1409   Register Src = MI.getOperand(1).getReg();
1410 
1411   const LLT S64 = LLT::scalar(64);
1412   const LLT S32 = LLT::scalar(32);
1413 
1414   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1415 
1416   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1417 
1418   auto CvtHi = Signed ?
1419     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1420     B.buildUITOFP(S64, Unmerge.getReg(1));
1421 
1422   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1423 
1424   auto ThirtyTwo = B.buildConstant(S32, 32);
1425   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1426     .addUse(CvtHi.getReg(0))
1427     .addUse(ThirtyTwo.getReg(0));
1428 
1429   // TODO: Should this propagate fast-math-flags?
1430   B.buildFAdd(Dst, LdExp, CvtLo);
1431   MI.eraseFromParent();
1432   return true;
1433 }
1434 
1435 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1436   MachineInstr &MI, MachineRegisterInfo &MRI,
1437   MachineIRBuilder &B) const {
1438   MachineFunction &MF = B.getMF();
1439   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1440 
1441   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1442                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1443 
1444   // With ieee_mode disabled, the instructions have the correct behavior
1445   // already for G_FMINNUM/G_FMAXNUM
1446   if (!MFI->getMode().IEEE)
1447     return !IsIEEEOp;
1448 
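  // In IEEE mode the hardware min/max already implement the *_IEEE semantics,
  // so only the non-IEEE variants need to be expanded.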
1449   if (IsIEEEOp)
1450     return true;
1451 
1452   MachineIRBuilder HelperBuilder(MI);
1453   GISelObserverWrapper DummyObserver;
1454   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1455   HelperBuilder.setInstr(MI);
1456   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1457 }
1458 
1459 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1460   MachineInstr &MI, MachineRegisterInfo &MRI,
1461   MachineIRBuilder &B) const {
1462   // TODO: Should move some of this into LegalizerHelper.
1463 
1464   // TODO: Promote dynamic indexing of s16 to s32
1465   // TODO: Dynamic s64 indexing is only legal for SGPR.
1466   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1467   if (!IdxVal) // Dynamic case will be selected to register indexing.
1468     return true;
1469 
1470   Register Dst = MI.getOperand(0).getReg();
1471   Register Vec = MI.getOperand(1).getReg();
1472 
1473   LLT VecTy = MRI.getType(Vec);
1474   LLT EltTy = VecTy.getElementType();
1475   assert(EltTy == MRI.getType(Dst));
1476 
1477   B.setInstr(MI);
1478 
1479   if (IdxVal.getValue() < VecTy.getNumElements())
1480     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1481   else
1482     B.buildUndef(Dst);
1483 
1484   MI.eraseFromParent();
1485   return true;
1486 }
1487 
1488 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1489   MachineInstr &MI, MachineRegisterInfo &MRI,
1490   MachineIRBuilder &B) const {
1491   // TODO: Should move some of this into LegalizerHelper.
1492 
1493   // TODO: Promote dynamic indexing of s16 to s32
1494   // TODO: Dynamic s64 indexing is only legal for SGPR.
1495   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1496   if (!IdxVal) // Dynamic case will be selected to register indexing.
1497     return true;
1498 
1499   Register Dst = MI.getOperand(0).getReg();
1500   Register Vec = MI.getOperand(1).getReg();
1501   Register Ins = MI.getOperand(2).getReg();
1502 
1503   LLT VecTy = MRI.getType(Vec);
1504   LLT EltTy = VecTy.getElementType();
1505   assert(EltTy == MRI.getType(Ins));
1506 
1507   B.setInstr(MI);
1508 
1509   if (IdxVal.getValue() < VecTy.getNumElements())
1510     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1511   else
1512     B.buildUndef(Dst);
1513 
1514   MI.eraseFromParent();
1515   return true;
1516 }
1517 
1518 bool AMDGPULegalizerInfo::legalizeSinCos(
1519   MachineInstr &MI, MachineRegisterInfo &MRI,
1520   MachineIRBuilder &B) const {
1521   B.setInstr(MI);
1522 
1523   Register DstReg = MI.getOperand(0).getReg();
1524   Register SrcReg = MI.getOperand(1).getReg();
1525   LLT Ty = MRI.getType(DstReg);
1526   unsigned Flags = MI.getFlags();
1527 
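  // The hardware sin/cos instructions expect the input pre-scaled by 1/(2*pi).
  // On subtargets with a reduced valid input range, also take the fractional
  // part of the scaled value to bring it back into range.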
1528   Register TrigVal;
1529   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1530   if (ST.hasTrigReducedRange()) {
1531     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1532     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1533       .addUse(MulVal.getReg(0))
1534       .setMIFlags(Flags).getReg(0);
1535   } else
1536     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1537 
1538   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1539     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1540   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1541     .addUse(TrigVal)
1542     .setMIFlags(Flags);
1543   MI.eraseFromParent();
1544   return true;
1545 }
1546 
1547 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1548   Register DstReg, LLT PtrTy,
1549   MachineIRBuilder &B, const GlobalValue *GV,
1550   unsigned Offset, unsigned GAFlags) const {
1551   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1552   // to the following code sequence:
1553   //
1554   // For constant address space:
1555   //   s_getpc_b64 s[0:1]
1556   //   s_add_u32 s0, s0, $symbol
1557   //   s_addc_u32 s1, s1, 0
1558   //
1559   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1560   //   a fixup or relocation is emitted to replace $symbol with a literal
1561   //   constant, which is a pc-relative offset from the encoding of the $symbol
1562   //   operand to the global variable.
1563   //
1564   // For global address space:
1565   //   s_getpc_b64 s[0:1]
1566   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1567   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1568   //
1569   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1570   //   fixups or relocations are emitted to replace $symbol@*@lo and
1571   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1572   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1573   //   operand to the global variable.
1574   //
1575   // What we want here is an offset from the value returned by s_getpc
1576   // (which is the address of the s_add_u32 instruction) to the global
1577   // variable, but since the encoding of $symbol starts 4 bytes after the start
1578   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1579   // small. This requires us to add 4 to the global variable offset in order to
1580   // compute the correct address.
1581 
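  // For a 32-bit address space, compute the full 64-bit PC-relative address in
  // a temporary register and extract the low 32 bits at the end.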
1582   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1583 
1584   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1585     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1586 
1587   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1588     .addDef(PCReg);
1589 
1590   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1591   if (GAFlags == SIInstrInfo::MO_NONE)
1592     MIB.addImm(0);
1593   else
1594     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1595 
1596   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1597 
1598   if (PtrTy.getSizeInBits() == 32)
1599     B.buildExtract(DstReg, PCReg, 0);
1600   return true;
1601 }
1602 
1603 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1604   MachineInstr &MI, MachineRegisterInfo &MRI,
1605   MachineIRBuilder &B) const {
1606   Register DstReg = MI.getOperand(0).getReg();
1607   LLT Ty = MRI.getType(DstReg);
1608   unsigned AS = Ty.getAddressSpace();
1609 
1610   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1611   MachineFunction &MF = B.getMF();
1612   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1613   B.setInstr(MI);
1614 
1615   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1616     if (!MFI->isEntryFunction()) {
1617       const Function &Fn = MF.getFunction();
1618       DiagnosticInfoUnsupported BadLDSDecl(
1619         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1620       Fn.getContext().diagnose(BadLDSDecl);
1621     }
1622 
1623     // TODO: We could emit code to handle the initialization somewhere.
1624     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1625       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1626       MI.eraseFromParent();
1627       return true;
1628     }
1629 
1630     const Function &Fn = MF.getFunction();
1631     DiagnosticInfoUnsupported BadInit(
1632       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1633     Fn.getContext().diagnose(BadInit);
1634     return true;
1635   }
1636 
1637   const SITargetLowering *TLI = ST.getTargetLowering();
1638 
1639   if (TLI->shouldEmitFixup(GV)) {
1640     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1641     MI.eraseFromParent();
1642     return true;
1643   }
1644 
1645   if (TLI->shouldEmitPCReloc(GV)) {
1646     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1647     MI.eraseFromParent();
1648     return true;
1649   }
1650 
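  // Otherwise compute the address of the GOT entry PC-relatively and load the
  // 64-bit global address from it.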
1651   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1652   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1653 
1654   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1655     MachinePointerInfo::getGOT(MF),
1656     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1657     MachineMemOperand::MOInvariant,
1658     8 /*Size*/, 8 /*Align*/);
1659 
1660   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1661 
1662   if (Ty.getSizeInBits() == 32) {
1663     // Truncate if this is a 32-bit constant address.
1664     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1665     B.buildExtract(DstReg, Load, 0);
1666   } else
1667     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1668 
1669   MI.eraseFromParent();
1670   return true;
1671 }
1672 
1673 bool AMDGPULegalizerInfo::legalizeLoad(
1674   MachineInstr &MI, MachineRegisterInfo &MRI,
1675   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1676   B.setInstr(MI);
1677   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1678   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1679   Observer.changingInstr(MI);
1680   MI.getOperand(1).setReg(Cast.getReg(0));
1681   Observer.changedInstr(MI);
1682   return true;
1683 }
1684 
1685 bool AMDGPULegalizerInfo::legalizeFMad(
1686   MachineInstr &MI, MachineRegisterInfo &MRI,
1687   MachineIRBuilder &B) const {
1688   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1689   assert(Ty.isScalar());
1690 
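  // G_FMAD is kept legal only when the matching denormal mode is disabled;
  // otherwise expand it through the LegalizerHelper.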
1691   // TODO: Always legal with future ftz flag.
1692   if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1693     return true;
1694   if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1695     return true;
1696 
1697   MachineFunction &MF = B.getMF();
1698 
1699   MachineIRBuilder HelperBuilder(MI);
1700   GISelObserverWrapper DummyObserver;
1701   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1702   HelperBuilder.setMBB(*MI.getParent());
1703   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1704 }
1705 
1706 // Return the branch instruction that uses the condition, or null if invalid.
1707 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1708                                        MachineRegisterInfo &MRI) {
1709   Register CondDef = MI.getOperand(0).getReg();
1710   if (!MRI.hasOneNonDBGUse(CondDef))
1711     return nullptr;
1712 
1713   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1714   return UseMI.getParent() == MI.getParent() &&
1715     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1716 }
1717 
1718 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1719                                                 Register Reg, LLT Ty) const {
1720   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1721   if (LiveIn)
1722     return LiveIn;
1723 
1724   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1725   MRI.addLiveIn(Reg, NewReg);
1726   return NewReg;
1727 }
1728 
1729 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1730                                          const ArgDescriptor *Arg) const {
1731   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1732     return false; // TODO: Handle these
1733 
1734   assert(Arg->getRegister().isPhysical());
1735 
1736   MachineRegisterInfo &MRI = *B.getMRI();
1737 
1738   LLT Ty = MRI.getType(DstReg);
1739   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1740 
1741   if (Arg->isMasked()) {
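    // The value occupies a bitfield of the incoming register, so shift it down
    // and mask out the requested field.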
1742     // TODO: Should we try to emit this once in the entry block?
1743     const LLT S32 = LLT::scalar(32);
1744     const unsigned Mask = Arg->getMask();
1745     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1746 
1747     Register AndMaskSrc = LiveIn;
1748 
1749     if (Shift != 0) {
1750       auto ShiftAmt = B.buildConstant(S32, Shift);
1751       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1752     }
1753 
1754     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1755   } else
1756     B.buildCopy(DstReg, LiveIn);
1757 
1758   // Insert the argument copy if it doesn't already exist.
1759   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1760   if (!MRI.getVRegDef(LiveIn)) {
1761     // FIXME: Should have scoped insert pt
1762     MachineBasicBlock &OrigInsBB = B.getMBB();
1763     auto OrigInsPt = B.getInsertPt();
1764 
1765     MachineBasicBlock &EntryMBB = B.getMF().front();
1766     EntryMBB.addLiveIn(Arg->getRegister());
1767     B.setInsertPt(EntryMBB, EntryMBB.begin());
1768     B.buildCopy(LiveIn, Arg->getRegister());
1769 
1770     B.setInsertPt(OrigInsBB, OrigInsPt);
1771   }
1772 
1773   return true;
1774 }
1775 
1776 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1777   MachineInstr &MI,
1778   MachineRegisterInfo &MRI,
1779   MachineIRBuilder &B,
1780   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1781   B.setInstr(MI);
1782 
1783   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1784 
1785   const ArgDescriptor *Arg;
1786   const TargetRegisterClass *RC;
1787   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1788   if (!Arg) {
1789     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1790     return false;
1791   }
1792 
1793   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1794     MI.eraseFromParent();
1795     return true;
1796   }
1797 
1798   return false;
1799 }
1800 
1801 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1802                                            MachineRegisterInfo &MRI,
1803                                            MachineIRBuilder &B) const {
1804   B.setInstr(MI);
1805   Register Res = MI.getOperand(0).getReg();
1806   Register LHS = MI.getOperand(2).getReg();
1807   Register RHS = MI.getOperand(3).getReg();
1808   uint16_t Flags = MI.getFlags();
1809 
1810   LLT S32 = LLT::scalar(32);
1811   LLT S1 = LLT::scalar(1);
1812 
1813   auto Abs = B.buildFAbs(S32, RHS, Flags);
1815 
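  // If |RHS| is larger than 0x1p+96, pre-scale the denominator by 0x1p-32
  // before taking its reciprocal and multiply the final result by the same
  // factor to compensate; otherwise scale by 1.0.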
1816   auto C0 = B.buildConstant(S32, 0x6f800000);
1817   auto C1 = B.buildConstant(S32, 0x2f800000);
1818   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1819 
1820   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1821   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1822 
1823   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1824 
1825   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1826     .addUse(Mul0.getReg(0))
1827     .setMIFlags(Flags);
1828 
1829   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1830 
1831   B.buildFMul(Res, Sel, Mul1, Flags);
1832 
1833   MI.eraseFromParent();
1834   return true;
1835 }
1836 
1837 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1838                                                  MachineRegisterInfo &MRI,
1839                                                  MachineIRBuilder &B) const {
1840   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1841   if (!MFI->isEntryFunction()) {
1842     return legalizePreloadedArgIntrin(MI, MRI, B,
1843                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1844   }
1845 
1846   B.setInstr(MI);
1847 
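  // For entry functions the implicit arguments live in the kernarg segment,
  // immediately after the explicit kernel arguments.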
1848   uint64_t Offset =
1849     ST.getTargetLowering()->getImplicitParameterOffset(
1850       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1851   Register DstReg = MI.getOperand(0).getReg();
1852   LLT DstTy = MRI.getType(DstReg);
1853   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1854 
1855   const ArgDescriptor *Arg;
1856   const TargetRegisterClass *RC;
1857   std::tie(Arg, RC)
1858     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1859   if (!Arg)
1860     return false;
1861 
1862   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1863   if (!loadInputValue(KernargPtrReg, B, Arg))
1864     return false;
1865 
1866   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1867   MI.eraseFromParent();
1868   return true;
1869 }
1870 
1871 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1872                                               MachineRegisterInfo &MRI,
1873                                               MachineIRBuilder &B,
1874                                               unsigned AddrSpace) const {
1875   B.setInstr(MI);
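  // A flat pointer is in the given segment if the high 32 bits of the pointer
  // equal the segment's aperture base.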
1876   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1877   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1878   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1879   MI.eraseFromParent();
1880   return true;
1881 }
1882 
1883 /// Handle the register layout difference for f16 images on some subtargets.
1884 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1885                                              MachineRegisterInfo &MRI,
1886                                              Register Reg) const {
1887   if (!ST.hasUnpackedD16VMem())
1888     return Reg;
1889 
1890   const LLT S16 = LLT::scalar(16);
1891   const LLT S32 = LLT::scalar(32);
1892   LLT StoreVT = MRI.getType(Reg);
1893   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1894 
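  // In the unpacked layout each 16-bit element occupies the low half of its
  // own 32-bit register, so any-extend every element to 32 bits and rebuild
  // the vector.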
1895   auto Unmerge = B.buildUnmerge(S16, Reg);
1896 
1897   SmallVector<Register, 4> WideRegs;
1898   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1899     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1900 
1901   int NumElts = StoreVT.getNumElements();
1902 
1903   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1904 }
1905 
1906 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1907                                                  MachineRegisterInfo &MRI,
1908                                                  MachineIRBuilder &B,
1909                                                  bool IsFormat) const {
1910   // TODO: Reject f16 format on targets where it is unsupported.
1911   Register VData = MI.getOperand(1).getReg();
1912   LLT Ty = MRI.getType(VData);
1913 
1914   B.setInstr(MI);
1915 
1916   const LLT S32 = LLT::scalar(32);
1917   const LLT S16 = LLT::scalar(16);
1918 
1919   // Fix up illegal register types for i8 and i16 stores by any-extending to i32.
1920   if (Ty == LLT::scalar(8) || Ty == S16) {
1921     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1922     MI.getOperand(1).setReg(AnyExt);
1923     return true;
1924   }
1925 
1926   if (Ty.isVector()) {
1927     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1928       if (IsFormat)
1929         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1930       return true;
1931     }
1932 
1933     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1934   }
1935 
1936   return Ty == S32;
1937 }
1938 
1939 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1940                                             MachineRegisterInfo &MRI,
1941                                             MachineIRBuilder &B) const {
1942   // For the control-flow intrinsics, replace the consuming G_BRCOND with the
  // exec-manipulating branch pseudos.
1943   switch (MI.getIntrinsicID()) {
1944   case Intrinsic::amdgcn_if: {
1945     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1946       const SIRegisterInfo *TRI
1947         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1948 
1949       B.setInstr(*BrCond);
1950       Register Def = MI.getOperand(1).getReg();
1951       Register Use = MI.getOperand(3).getReg();
1952       B.buildInstr(AMDGPU::SI_IF)
1953         .addDef(Def)
1954         .addUse(Use)
1955         .addMBB(BrCond->getOperand(1).getMBB());
1956 
1957       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1958       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1959       MI.eraseFromParent();
1960       BrCond->eraseFromParent();
1961       return true;
1962     }
1963 
1964     return false;
1965   }
1966   case Intrinsic::amdgcn_loop: {
1967     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1968       const SIRegisterInfo *TRI
1969         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1970 
1971       B.setInstr(*BrCond);
1972       Register Reg = MI.getOperand(2).getReg();
1973       B.buildInstr(AMDGPU::SI_LOOP)
1974         .addUse(Reg)
1975         .addMBB(BrCond->getOperand(1).getMBB());
1976       MI.eraseFromParent();
1977       BrCond->eraseFromParent();
1978       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1979       return true;
1980     }
1981 
1982     return false;
1983   }
1984   case Intrinsic::amdgcn_kernarg_segment_ptr:
1985     return legalizePreloadedArgIntrin(
1986       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1987   case Intrinsic::amdgcn_implicitarg_ptr:
1988     return legalizeImplicitArgPtr(MI, MRI, B);
1989   case Intrinsic::amdgcn_workitem_id_x:
1990     return legalizePreloadedArgIntrin(MI, MRI, B,
1991                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1992   case Intrinsic::amdgcn_workitem_id_y:
1993     return legalizePreloadedArgIntrin(MI, MRI, B,
1994                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1995   case Intrinsic::amdgcn_workitem_id_z:
1996     return legalizePreloadedArgIntrin(MI, MRI, B,
1997                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1998   case Intrinsic::amdgcn_workgroup_id_x:
1999     return legalizePreloadedArgIntrin(MI, MRI, B,
2000                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2001   case Intrinsic::amdgcn_workgroup_id_y:
2002     return legalizePreloadedArgIntrin(MI, MRI, B,
2003                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2004   case Intrinsic::amdgcn_workgroup_id_z:
2005     return legalizePreloadedArgIntrin(MI, MRI, B,
2006                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2007   case Intrinsic::amdgcn_dispatch_ptr:
2008     return legalizePreloadedArgIntrin(MI, MRI, B,
2009                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2010   case Intrinsic::amdgcn_queue_ptr:
2011     return legalizePreloadedArgIntrin(MI, MRI, B,
2012                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2013   case Intrinsic::amdgcn_implicit_buffer_ptr:
2014     return legalizePreloadedArgIntrin(
2015       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2016   case Intrinsic::amdgcn_dispatch_id:
2017     return legalizePreloadedArgIntrin(MI, MRI, B,
2018                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2019   case Intrinsic::amdgcn_fdiv_fast:
2020     return legalizeFDIVFast(MI, MRI, B);
2021   case Intrinsic::amdgcn_is_shared:
2022     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2023   case Intrinsic::amdgcn_is_private:
2024     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2025   case Intrinsic::amdgcn_wavefrontsize: {
2026     B.setInstr(MI);
2027     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2028     MI.eraseFromParent();
2029     return true;
2030   }
2031   case Intrinsic::amdgcn_raw_buffer_store:
2032     return legalizeRawBufferStore(MI, MRI, B, false);
2033   case Intrinsic::amdgcn_raw_buffer_store_format:
2034     return legalizeRawBufferStore(MI, MRI, B, true);
2035   default:
2036     return true;
2037   }
2038 
2039   return true;
2040 }
2041