1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
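// Matches types whose total size is at most MaxSize bits and whose scalar or
// element size is a multiple of 32 bits.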
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
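// Matches vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits (e.g. v3s16).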
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
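// Matches vectors of 16-bit elements with more than two elements; used below
// to break wide 16-bit vectors down to v2s16.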
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
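// Mutation that pads the vector with one extra element of the same element
// type.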
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
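// Mutation that reduces the element count so each split piece is at most
// 64 bits wide, keeping the element type.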
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
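// Matches scalars wider than 32 bits whose memory size is smaller than the
// register size, i.e. wide truncating stores.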
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal);
248 
249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250   // elements for v3s16
251   getActionDefinitionsBuilder(G_PHI)
252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253     .legalFor(AllS32Vectors)
254     .legalFor(AllS64Vectors)
255     .legalFor(AddrSpaces64)
256     .legalFor(AddrSpaces32)
257     .clampScalar(0, S32, S256)
258     .widenScalarToNextPow2(0, 32)
259     .clampMaxNumElements(0, S32, 16)
260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261     .legalIf(isPointer(0));
262 
263   if (ST.has16BitInsts()) {
264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265       .legalFor({S32, S16})
266       .clampScalar(0, S16, S32)
267       .scalarize(0);
268   } else {
269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270       .legalFor({S32})
271       .clampScalar(0, S32, S32)
272       .scalarize(0);
273   }
274 
275   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276     .legalFor({S32})
277     .clampScalar(0, S32, S32)
278     .scalarize(0);
279 
280   // Report legal for any types we can handle anywhere. For the cases only legal
281   // on the SALU, RegBankSelect will be able to re-legalize.
282   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284     .clampScalar(0, S32, S64)
285     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287     .widenScalarToNextPow2(0)
288     .scalarize(0);
289 
290   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
291                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292     .legalFor({{S32, S1}})
293     .clampScalar(0, S32, S32)
294     .scalarize(0); // TODO: Implement.
295 
296   getActionDefinitionsBuilder(G_BITCAST)
297     // Don't worry about the size constraint.
298     .legalIf(all(isRegisterType(0), isRegisterType(1)))
299     // FIXME: Testing hack
300     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
301 
302   getActionDefinitionsBuilder(G_FCONSTANT)
303     .legalFor({S32, S64, S16})
304     .clampScalar(0, S16, S64);
305 
306   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
307     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
308                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
309     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
310     .clampScalarOrElt(0, S32, S1024)
311     .legalIf(isMultiple32(0))
312     .widenScalarToNextPow2(0, 32)
313     .clampMaxNumElements(0, S32, 16);
314 
315 
316   // FIXME: i1 operands to intrinsics should always be legal, but other i1
317   // values may not be legal.  We need to figure out how to distinguish
318   // between these two scenarios.
319   getActionDefinitionsBuilder(G_CONSTANT)
320     .legalFor({S1, S32, S64, S16, GlobalPtr,
321                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
322     .clampScalar(0, S32, S64)
323     .widenScalarToNextPow2(0)
324     .legalIf(isPointer(0));
325 
326   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
327   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
328     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
329 
330 
331   auto &FPOpActions = getActionDefinitionsBuilder(
332     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
333     .legalFor({S32, S64});
334   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
335     .customFor({S32, S64});
336 
337   if (ST.has16BitInsts()) {
338     if (ST.hasVOP3PInsts())
339       FPOpActions.legalFor({S16, V2S16});
340     else
341       FPOpActions.legalFor({S16});
342 
343     TrigActions.customFor({S16});
344   }
345 
346   auto &MinNumMaxNum = getActionDefinitionsBuilder({
347       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
348 
349   if (ST.hasVOP3PInsts()) {
350     MinNumMaxNum.customFor(FPTypesPK16)
351       .clampMaxNumElements(0, S16, 2)
352       .clampScalar(0, S16, S64)
353       .scalarize(0);
354   } else if (ST.has16BitInsts()) {
355     MinNumMaxNum.customFor(FPTypes16)
356       .clampScalar(0, S16, S64)
357       .scalarize(0);
358   } else {
359     MinNumMaxNum.customFor(FPTypesBase)
360       .clampScalar(0, S32, S64)
361       .scalarize(0);
362   }
363 
364   if (ST.hasVOP3PInsts())
365     FPOpActions.clampMaxNumElements(0, S16, 2);
366 
367   FPOpActions
368     .scalarize(0)
369     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
370 
371   TrigActions
372     .scalarize(0)
373     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
374 
375   getActionDefinitionsBuilder({G_FNEG, G_FABS})
376     .legalFor(FPTypesPK16)
377     .clampMaxNumElements(0, S16, 2)
378     .scalarize(0)
379     .clampScalar(0, S16, S64);
380 
381   // TODO: Implement
382   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
383 
384   if (ST.has16BitInsts()) {
385     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
386       .legalFor({S32, S64, S16})
387       .scalarize(0)
388       .clampScalar(0, S16, S64);
389   } else {
390     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
391       .legalFor({S32, S64})
392       .scalarize(0)
393       .clampScalar(0, S32, S64);
394   }
395 
396   getActionDefinitionsBuilder(G_FPTRUNC)
397     .legalFor({{S32, S64}, {S16, S32}})
398     .scalarize(0);
399 
400   getActionDefinitionsBuilder(G_FPEXT)
401     .legalFor({{S64, S32}, {S32, S16}})
402     .lowerFor({{S64, S16}}) // FIXME: Implement
403     .scalarize(0);
404 
405   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
406   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
407 
408   getActionDefinitionsBuilder(G_FSUB)
409       // Use actual fsub instruction
410       .legalFor({S32})
411       // Must use fadd + fneg
412       .lowerFor({S64, S16, V2S16})
413       .scalarize(0)
414       .clampScalar(0, S32, S64);
415 
416   // Whether this is legal depends on the floating point mode for the function.
417   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
418   if (ST.hasMadF16())
419     FMad.customFor({S32, S16});
420   else
421     FMad.customFor({S32});
422   FMad.scalarize(0)
423       .lower();
424 
425   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
426     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
427                {S32, S1}, {S64, S1}, {S16, S1},
428                {S96, S32},
429                // FIXME: Hack
430                {S64, LLT::scalar(33)},
431                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
432     .scalarize(0);
433 
434   // TODO: Split s1->s64 during regbankselect for VALU.
435   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
436     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
437     .lowerFor({{S32, S64}})
438     .customFor({{S64, S64}});
439   if (ST.has16BitInsts())
440     IToFP.legalFor({{S16, S16}});
441   IToFP.clampScalar(1, S32, S64)
442        .scalarize(0);
443 
444   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
445     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
446   if (ST.has16BitInsts())
447     FPToI.legalFor({{S16, S16}});
448   else
449     FPToI.minScalar(1, S32);
450 
451   FPToI.minScalar(0, S32)
452        .scalarize(0);
453 
454   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
455     .legalFor({S32, S64})
456     .scalarize(0);
457 
458   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
459     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
460       .legalFor({S32, S64})
461       .clampScalar(0, S32, S64)
462       .scalarize(0);
463   } else {
464     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
465       .legalFor({S32})
466       .customFor({S64})
467       .clampScalar(0, S32, S64)
468       .scalarize(0);
469   }
470 
471   getActionDefinitionsBuilder(G_GEP)
472     .legalForCartesianProduct(AddrSpaces64, {S64})
473     .legalForCartesianProduct(AddrSpaces32, {S32})
474     .scalarize(0);
475 
476   getActionDefinitionsBuilder(G_PTR_MASK)
477     .scalarize(0)
478     .alwaysLegal();
479 
480   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
481 
482   auto &CmpBuilder =
483     getActionDefinitionsBuilder(G_ICMP)
484     .legalForCartesianProduct(
485       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
486     .legalFor({{S1, S32}, {S1, S64}});
487   if (ST.has16BitInsts()) {
488     CmpBuilder.legalFor({{S1, S16}});
489   }
490 
491   CmpBuilder
492     .widenScalarToNextPow2(1)
493     .clampScalar(1, S32, S64)
494     .scalarize(0)
495     .legalIf(all(typeIs(0, S1), isPointer(1)));
496 
497   getActionDefinitionsBuilder(G_FCMP)
498     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
499     .widenScalarToNextPow2(1)
500     .clampScalar(1, S32, S64)
501     .scalarize(0);
502 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
504   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
505                                G_FLOG, G_FLOG2, G_FLOG10})
506     .legalFor({S32})
507     .scalarize(0);
508 
509   // The 64-bit versions produce 32-bit results, but only on the SALU.
510   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
511                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
512                                G_CTPOP})
513     .legalFor({{S32, S32}, {S32, S64}})
514     .clampScalar(0, S32, S32)
515     .clampScalar(1, S32, S64)
516     .scalarize(0)
517     .widenScalarToNextPow2(0, 32)
518     .widenScalarToNextPow2(1, 32);
519 
520   // TODO: Expand for > s32
521   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
522     .legalFor({S32})
523     .clampScalar(0, S32, S32)
524     .scalarize(0);
525 
526   if (ST.has16BitInsts()) {
527     if (ST.hasVOP3PInsts()) {
528       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
529         .legalFor({S32, S16, V2S16})
530         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
531         .clampMaxNumElements(0, S16, 2)
532         .clampScalar(0, S16, S32)
533         .widenScalarToNextPow2(0)
534         .scalarize(0);
535     } else {
536       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
537         .legalFor({S32, S16})
538         .widenScalarToNextPow2(0)
539         .clampScalar(0, S16, S32)
540         .scalarize(0);
541     }
542   } else {
543     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
544       .legalFor({S32})
545       .clampScalar(0, S32, S32)
546       .widenScalarToNextPow2(0)
547       .scalarize(0);
548   }
549 
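  // Helpers comparing the sizes of two type indices, used by the
  // G_INTTOPTR/G_PTRTOINT rules below.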
550   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
551     return [=](const LegalityQuery &Query) {
552       return Query.Types[TypeIdx0].getSizeInBits() <
553              Query.Types[TypeIdx1].getSizeInBits();
554     };
555   };
556 
557   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
558     return [=](const LegalityQuery &Query) {
559       return Query.Types[TypeIdx0].getSizeInBits() >
560              Query.Types[TypeIdx1].getSizeInBits();
561     };
562   };
563 
564   getActionDefinitionsBuilder(G_INTTOPTR)
565     // List the common cases
566     .legalForCartesianProduct(AddrSpaces64, {S64})
567     .legalForCartesianProduct(AddrSpaces32, {S32})
568     .scalarize(0)
569     // Accept any address space as long as the size matches
570     .legalIf(sameSize(0, 1))
571     .widenScalarIf(smallerThan(1, 0),
572       [](const LegalityQuery &Query) {
573         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
574       })
575     .narrowScalarIf(greaterThan(1, 0),
576       [](const LegalityQuery &Query) {
577         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
578       });
579 
580   getActionDefinitionsBuilder(G_PTRTOINT)
581     // List the common cases
582     .legalForCartesianProduct(AddrSpaces64, {S64})
583     .legalForCartesianProduct(AddrSpaces32, {S32})
584     .scalarize(0)
585     // Accept any address space as long as the size matches
586     .legalIf(sameSize(0, 1))
587     .widenScalarIf(smallerThan(0, 1),
588       [](const LegalityQuery &Query) {
589         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
590       })
591     .narrowScalarIf(
592       greaterThan(0, 1),
593       [](const LegalityQuery &Query) {
594         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
595       });
596 
597   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
598     .scalarize(0)
599     .custom();
600 
601   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
602   // handle some operations by just promoting the register during
603   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
604   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
605     switch (AS) {
606     // FIXME: Private element size.
607     case AMDGPUAS::PRIVATE_ADDRESS:
608       return 32;
609     // FIXME: Check subtarget
610     case AMDGPUAS::LOCAL_ADDRESS:
611       return ST.useDS128() ? 128 : 64;
612 
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and whether the memory is invariant or not
    // written in a kernel.
619     case AMDGPUAS::CONSTANT_ADDRESS:
620     case AMDGPUAS::GLOBAL_ADDRESS:
621       return 512;
622     default:
623       return 128;
624     }
625   };
626 
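  // Return true if a load/store must be split: vector extloads, accesses wider
  // than the address space limit, 96-bit (dwordx3) accesses the subtarget
  // lacks, and under-aligned accesses the target cannot handle.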
627   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
628     const LLT DstTy = Query.Types[0];
629 
630     // Split vector extloads.
631     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
632     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
633       return true;
634 
635     const LLT PtrTy = Query.Types[1];
636     unsigned AS = PtrTy.getAddressSpace();
637     if (MemSize > maxSizeForAddrSpace(AS))
638       return true;
639 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
641     // TODO: May be able to widen depending on alignment etc.
642     unsigned NumRegs = MemSize / 32;
643     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
644       return true;
645 
646     unsigned Align = Query.MMODescrs[0].AlignInBits;
647     if (Align < MemSize) {
648       const SITargetLowering *TLI = ST.getTargetLowering();
649       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
650     }
651 
652     return false;
653   };
654 
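  // Required alignments (in bits) for the global/flat memory descriptors
  // below; relaxed when the subtarget supports unaligned buffer access.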
655   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
656   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
657   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
658 
659   // TODO: Refine based on subtargets which support unaligned access or 128-bit
660   // LDS
661   // TODO: Unsupported flat for SI.
662 
663   for (unsigned Op : {G_LOAD, G_STORE}) {
664     const bool IsStore = Op == G_STORE;
665 
666     auto &Actions = getActionDefinitionsBuilder(Op);
667     // Whitelist the common cases.
668     // TODO: Pointer loads
669     // TODO: Wide constant loads
670     // TODO: Only CI+ has 3x loads
671     // TODO: Loads to s16 on gfx9
672     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
673                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
674                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
675                                       {S96, GlobalPtr, 96, GlobalAlign32},
676                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
677                                       {S128, GlobalPtr, 128, GlobalAlign32},
678                                       {S64, GlobalPtr, 64, GlobalAlign32},
679                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
680                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
681                                       {S32, GlobalPtr, 8, GlobalAlign8},
682                                       {S32, GlobalPtr, 16, GlobalAlign16},
683 
684                                       {S32, LocalPtr, 32, 32},
685                                       {S64, LocalPtr, 64, 32},
686                                       {V2S32, LocalPtr, 64, 32},
687                                       {S32, LocalPtr, 8, 8},
688                                       {S32, LocalPtr, 16, 16},
689                                       {V2S16, LocalPtr, 32, 32},
690 
691                                       {S32, PrivatePtr, 32, 32},
692                                       {S32, PrivatePtr, 8, 8},
693                                       {S32, PrivatePtr, 16, 16},
694                                       {V2S16, PrivatePtr, 32, 32},
695 
696                                       {S32, FlatPtr, 32, GlobalAlign32},
697                                       {S32, FlatPtr, 16, GlobalAlign16},
698                                       {S32, FlatPtr, 8, GlobalAlign8},
699                                       {V2S16, FlatPtr, 32, GlobalAlign32},
700 
701                                       {S32, ConstantPtr, 32, GlobalAlign32},
702                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
703                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
704                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
705                                       {S64, ConstantPtr, 64, GlobalAlign32},
706                                       {S128, ConstantPtr, 128, GlobalAlign32},
707                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
708     Actions
709         .customIf(typeIs(1, Constant32Ptr))
710         .narrowScalarIf(
711             [=](const LegalityQuery &Query) -> bool {
712               return !Query.Types[0].isVector() && needToSplitLoad(Query);
713             },
714             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
715               const LLT DstTy = Query.Types[0];
716               const LLT PtrTy = Query.Types[1];
717 
718               const unsigned DstSize = DstTy.getSizeInBits();
719               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
720 
721               // Split extloads.
722               if (DstSize > MemSize)
723                 return std::make_pair(0, LLT::scalar(MemSize));
724 
725               if (DstSize > 32 && (DstSize % 32 != 0)) {
726                 // FIXME: Need a way to specify non-extload of larger size if
727                 // suitably aligned.
728                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
729               }
730 
731               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
732               if (MemSize > MaxSize)
733                 return std::make_pair(0, LLT::scalar(MaxSize));
734 
735               unsigned Align = Query.MMODescrs[0].AlignInBits;
736               return std::make_pair(0, LLT::scalar(Align));
737             })
738         .fewerElementsIf(
739             [=](const LegalityQuery &Query) -> bool {
740               return Query.Types[0].isVector() && needToSplitLoad(Query);
741             },
742             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
743               const LLT DstTy = Query.Types[0];
744               const LLT PtrTy = Query.Types[1];
745 
746               LLT EltTy = DstTy.getElementType();
747               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
748 
749               // Split if it's too large for the address space.
750               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
751                 unsigned NumElts = DstTy.getNumElements();
752                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
753 
754                 // FIXME: Refine when odd breakdowns handled
755                 // The scalars will need to be re-legalized.
756                 if (NumPieces == 1 || NumPieces >= NumElts ||
757                     NumElts % NumPieces != 0)
758                   return std::make_pair(0, EltTy);
759 
760                 return std::make_pair(0,
761                                       LLT::vector(NumElts / NumPieces, EltTy));
762               }
763 
764               // Need to split because of alignment.
765               unsigned Align = Query.MMODescrs[0].AlignInBits;
766               unsigned EltSize = EltTy.getSizeInBits();
767               if (EltSize > Align &&
768                   (EltSize / Align < DstTy.getNumElements())) {
769                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
770               }
771 
772               // May need relegalization for the scalars.
773               return std::make_pair(0, EltTy);
774             })
775         .minScalar(0, S32);
776 
777     if (IsStore)
778       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
779 
780     // TODO: Need a bitcast lower option?
781     Actions
782         .legalIf([=](const LegalityQuery &Query) {
783           const LLT Ty0 = Query.Types[0];
784           unsigned Size = Ty0.getSizeInBits();
785           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
786           unsigned Align = Query.MMODescrs[0].AlignInBits;
787 
788           // No extending vector loads.
789           if (Size > MemSize && Ty0.isVector())
790             return false;
791 
792           // FIXME: Widening store from alignment not valid.
793           if (MemSize < Size)
794             MemSize = std::max(MemSize, Align);
795 
796           switch (MemSize) {
797           case 8:
798           case 16:
799             return Size == 32;
800           case 32:
801           case 64:
802           case 128:
803             return true;
804           case 96:
805             return ST.hasDwordx3LoadStores();
806           case 256:
807           case 512:
808             return true;
809           default:
810             return false;
811           }
812         })
813         .widenScalarToNextPow2(0)
814         // TODO: v3s32->v4s32 with alignment
815         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
816   }
817 
818   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
819                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
820                                                   {S32, GlobalPtr, 16, 2 * 8},
821                                                   {S32, LocalPtr, 8, 8},
822                                                   {S32, LocalPtr, 16, 16},
823                                                   {S32, PrivatePtr, 8, 8},
824                                                   {S32, PrivatePtr, 16, 16},
825                                                   {S32, ConstantPtr, 8, 8},
826                                                   {S32, ConstantPtr, 16, 2 * 8}});
827   if (ST.hasFlatAddressSpace()) {
828     ExtLoads.legalForTypesWithMemDesc(
829         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
830   }
831 
832   ExtLoads.clampScalar(0, S32, S32)
833           .widenScalarToNextPow2(0)
834           .unsupportedIfMemSizeNotPow2()
835           .lower();
836 
837   auto &Atomics = getActionDefinitionsBuilder(
838     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
839      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
840      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
841      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
842     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
843                {S64, GlobalPtr}, {S64, LocalPtr}});
844   if (ST.hasFlatAddressSpace()) {
845     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
846   }
847 
848   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
849     .legalFor({{S32, LocalPtr}});
850 
851   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
852     .lower();
853 
854   // TODO: Pointer types, any 32-bit or 64-bit vector
855   getActionDefinitionsBuilder(G_SELECT)
856     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
857           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
858           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
859     .clampScalar(0, S16, S64)
860     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
861     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
862     .scalarize(1)
863     .clampMaxNumElements(0, S32, 2)
864     .clampMaxNumElements(0, LocalPtr, 2)
865     .clampMaxNumElements(0, PrivatePtr, 2)
866     .scalarize(0)
867     .widenScalarToNextPow2(0)
868     .legalIf(all(isPointer(0), typeIs(1, S1)));
869 
870   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
871   // be more flexible with the shift amount type.
872   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
873     .legalFor({{S32, S32}, {S64, S32}});
874   if (ST.has16BitInsts()) {
875     if (ST.hasVOP3PInsts()) {
876       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
877             .clampMaxNumElements(0, S16, 2);
878     } else
879       Shifts.legalFor({{S16, S32}, {S16, S16}});
880 
881     Shifts.clampScalar(1, S16, S32);
882     Shifts.clampScalar(0, S16, S64);
883     Shifts.widenScalarToNextPow2(0, 16);
884   } else {
885     // Make sure we legalize the shift amount type first, as the general
886     // expansion for the shifted type will produce much worse code if it hasn't
887     // been truncated already.
888     Shifts.clampScalar(1, S32, S32);
889     Shifts.clampScalar(0, S32, S64);
890     Shifts.widenScalarToNextPow2(0, 32);
891   }
892   Shifts.scalarize(0);
893 
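  // Extract/insert vector element is custom-legalized for 16-bit elements and
  // 32-bit-multiple elements in vectors of up to 1024 bits with a 32-bit
  // index; other cases are clamped toward those forms.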
894   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
895     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
896     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
897     unsigned IdxTypeIdx = 2;
898 
899     getActionDefinitionsBuilder(Op)
900       .customIf([=](const LegalityQuery &Query) {
901           const LLT EltTy = Query.Types[EltTypeIdx];
902           const LLT VecTy = Query.Types[VecTypeIdx];
903           const LLT IdxTy = Query.Types[IdxTypeIdx];
904           return (EltTy.getSizeInBits() == 16 ||
905                   EltTy.getSizeInBits() % 32 == 0) &&
906                  VecTy.getSizeInBits() % 32 == 0 &&
907                  VecTy.getSizeInBits() <= 1024 &&
908                  IdxTy.getSizeInBits() == 32;
909         })
910       .clampScalar(EltTypeIdx, S32, S64)
911       .clampScalar(VecTypeIdx, S32, S64)
912       .clampScalar(IdxTypeIdx, S32, S32);
913   }
914 
915   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
916     .unsupportedIf([=](const LegalityQuery &Query) {
917         const LLT &EltTy = Query.Types[1].getElementType();
918         return Query.Types[0] != EltTy;
919       });
920 
921   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
922     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
923     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
924 
925     // FIXME: Doesn't handle extract of illegal sizes.
926     getActionDefinitionsBuilder(Op)
927       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
928       // FIXME: Multiples of 16 should not be legal.
929       .legalIf([=](const LegalityQuery &Query) {
930           const LLT BigTy = Query.Types[BigTyIdx];
931           const LLT LitTy = Query.Types[LitTyIdx];
932           return (BigTy.getSizeInBits() % 32 == 0) &&
933                  (LitTy.getSizeInBits() % 16 == 0);
934         })
935       .widenScalarIf(
936         [=](const LegalityQuery &Query) {
937           const LLT BigTy = Query.Types[BigTyIdx];
938           return (BigTy.getScalarSizeInBits() < 16);
939         },
940         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
941       .widenScalarIf(
942         [=](const LegalityQuery &Query) {
943           const LLT LitTy = Query.Types[LitTyIdx];
944           return (LitTy.getScalarSizeInBits() < 16);
945         },
946         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
947       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
948       .widenScalarToNextPow2(BigTyIdx, 32);
949 
950   }
951 
952   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
953     .legalForCartesianProduct(AllS32Vectors, {S32})
954     .legalForCartesianProduct(AllS64Vectors, {S64})
955     .clampNumElements(0, V16S32, V32S32)
956     .clampNumElements(0, V2S64, V16S64)
957     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
958 
959   if (ST.hasScalarPackInsts())
960     BuildVector.legalFor({V2S16, S32});
961 
962   BuildVector
963     .minScalarSameAs(1, 0)
964     .legalIf(isRegisterType(0))
965     .minScalarOrElt(0, S32);
966 
967   if (ST.hasScalarPackInsts()) {
968     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
969       .legalFor({V2S16, S32})
970       .lower();
971   } else {
972     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
973       .lower();
974   }
975 
976   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
977     .legalIf(isRegisterType(0));
978 
979   // TODO: Don't fully scalarize v2s16 pieces
980   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
981 
982   // Merge/Unmerge
983   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
984     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
985     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
986 
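    // Element types narrower than 8 bits, wider than 64 bits, or with a
    // non-power-of-2 size are not handled directly; such vectors are
    // scalarized below.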
987     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
988       const LLT &Ty = Query.Types[TypeIdx];
989       if (Ty.isVector()) {
990         const LLT &EltTy = Ty.getElementType();
991         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
992           return true;
993         if (!isPowerOf2_32(EltTy.getSizeInBits()))
994           return true;
995       }
996       return false;
997     };
998 
999     auto &Builder = getActionDefinitionsBuilder(Op)
1000       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1004       .clampScalar(LitTyIdx, S16, S256)
1005       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1006       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1007       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1008                            elementTypeIs(1, S16)),
1009                        changeTo(1, V2S16))
1010       // Break up vectors with weird elements into scalars
1011       .fewerElementsIf(
1012         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1013         scalarize(0))
1014       .fewerElementsIf(
1015         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1016         scalarize(1))
1017       .clampScalar(BigTyIdx, S32, S1024)
1018       .lowerFor({{S16, V2S16}});
1019 
1020     if (Op == G_MERGE_VALUES) {
1021       Builder.widenScalarIf(
1022         // TODO: Use 16-bit shifts if legal for 8-bit values?
1023         [=](const LegalityQuery &Query) {
1024           const LLT Ty = Query.Types[LitTyIdx];
1025           return Ty.getSizeInBits() < 32;
1026         },
1027         changeTo(LitTyIdx, S32));
1028     }
1029 
1030     Builder.widenScalarIf(
1031       [=](const LegalityQuery &Query) {
1032         const LLT Ty = Query.Types[BigTyIdx];
1033         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1034           Ty.getSizeInBits() % 16 != 0;
1035       },
1036       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1039         const LLT &Ty = Query.Types[BigTyIdx];
1040         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1041         if (NewSizeInBits >= 256) {
1042           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1043           if (RoundedTo < NewSizeInBits)
1044             NewSizeInBits = RoundedTo;
1045         }
1046         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1047       })
1048       .legalIf([=](const LegalityQuery &Query) {
1049           const LLT &BigTy = Query.Types[BigTyIdx];
1050           const LLT &LitTy = Query.Types[LitTyIdx];
1051 
1052           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1053             return false;
1054           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1055             return false;
1056 
1057           return BigTy.getSizeInBits() % 16 == 0 &&
1058                  LitTy.getSizeInBits() % 16 == 0 &&
1059                  BigTy.getSizeInBits() <= 1024;
1060         })
1061       // Any vectors left are the wrong size. Scalarize them.
1062       .scalarize(0)
1063       .scalarize(1);
1064   }
1065 
1066   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1067 
1068   computeTables();
1069   verify(*ST.getInstrInfo());
1070 }
1071 
1072 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1073                                          MachineRegisterInfo &MRI,
1074                                          MachineIRBuilder &B,
1075                                          GISelChangeObserver &Observer) const {
1076   switch (MI.getOpcode()) {
1077   case TargetOpcode::G_ADDRSPACE_CAST:
1078     return legalizeAddrSpaceCast(MI, MRI, B);
1079   case TargetOpcode::G_FRINT:
1080     return legalizeFrint(MI, MRI, B);
1081   case TargetOpcode::G_FCEIL:
1082     return legalizeFceil(MI, MRI, B);
1083   case TargetOpcode::G_INTRINSIC_TRUNC:
1084     return legalizeIntrinsicTrunc(MI, MRI, B);
1085   case TargetOpcode::G_SITOFP:
1086     return legalizeITOFP(MI, MRI, B, true);
1087   case TargetOpcode::G_UITOFP:
1088     return legalizeITOFP(MI, MRI, B, false);
1089   case TargetOpcode::G_FMINNUM:
1090   case TargetOpcode::G_FMAXNUM:
1091   case TargetOpcode::G_FMINNUM_IEEE:
1092   case TargetOpcode::G_FMAXNUM_IEEE:
1093     return legalizeMinNumMaxNum(MI, MRI, B);
1094   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1095     return legalizeExtractVectorElt(MI, MRI, B);
1096   case TargetOpcode::G_INSERT_VECTOR_ELT:
1097     return legalizeInsertVectorElt(MI, MRI, B);
1098   case TargetOpcode::G_FSIN:
1099   case TargetOpcode::G_FCOS:
1100     return legalizeSinCos(MI, MRI, B);
1101   case TargetOpcode::G_GLOBAL_VALUE:
1102     return legalizeGlobalValue(MI, MRI, B);
1103   case TargetOpcode::G_LOAD:
1104     return legalizeLoad(MI, MRI, B, Observer);
1105   case TargetOpcode::G_FMAD:
1106     return legalizeFMad(MI, MRI, B);
1107   default:
1108     return false;
1109   }
1110 
1111   llvm_unreachable("expected switch to return");
1112 }
1113 
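// Return a 32-bit register containing the aperture (the high half of the flat
// address) for the given LDS or private address space, read from the hardware
// aperture registers when available or otherwise loaded from the amd_queue_t
// via the queue pointer.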
1114 Register AMDGPULegalizerInfo::getSegmentAperture(
1115   unsigned AS,
1116   MachineRegisterInfo &MRI,
1117   MachineIRBuilder &B) const {
1118   MachineFunction &MF = B.getMF();
1119   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1120   const LLT S32 = LLT::scalar(32);
1121 
1122   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1123 
1124   if (ST.hasApertureRegs()) {
1125     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1126     // getreg.
1127     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1128         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1129         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1130     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1131         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1132         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1133     unsigned Encoding =
1134         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1135         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1136         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1137 
1138     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1139     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1140 
1141     B.buildInstr(AMDGPU::S_GETREG_B32)
1142       .addDef(GetReg)
1143       .addImm(Encoding);
1144     MRI.setType(GetReg, S32);
1145 
1146     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1147     B.buildInstr(TargetOpcode::G_SHL)
1148       .addDef(ApertureReg)
1149       .addUse(GetReg)
1150       .addUse(ShiftAmt.getReg(0));
1151 
1152     return ApertureReg;
1153   }
1154 
1155   Register QueuePtr = MRI.createGenericVirtualRegister(
1156     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1157 
1158   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1159   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1160     return Register();
1161 
1162   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1163   // private_segment_aperture_base_hi.
1164   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1165 
1166   // FIXME: Don't use undef
1167   Value *V = UndefValue::get(PointerType::get(
1168                                Type::getInt8Ty(MF.getFunction().getContext()),
1169                                AMDGPUAS::CONSTANT_ADDRESS));
1170 
1171   MachinePointerInfo PtrInfo(V, StructOffset);
1172   MachineMemOperand *MMO = MF.getMachineMemOperand(
1173     PtrInfo,
1174     MachineMemOperand::MOLoad |
1175     MachineMemOperand::MODereferenceable |
1176     MachineMemOperand::MOInvariant,
1177     4,
1178     MinAlign(64, StructOffset));
1179 
1180   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1181   Register LoadAddr;
1182 
1183   B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1184   B.buildLoad(LoadResult, LoadAddr, *MMO);
1185   return LoadResult;
1186 }
1187 
1188 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1189   MachineInstr &MI, MachineRegisterInfo &MRI,
1190   MachineIRBuilder &B) const {
1191   MachineFunction &MF = B.getMF();
1192 
1193   B.setInstr(MI);
1194 
1195   const LLT S32 = LLT::scalar(32);
1196   Register Dst = MI.getOperand(0).getReg();
1197   Register Src = MI.getOperand(1).getReg();
1198 
1199   LLT DstTy = MRI.getType(Dst);
1200   LLT SrcTy = MRI.getType(Src);
1201   unsigned DestAS = DstTy.getAddressSpace();
1202   unsigned SrcAS = SrcTy.getAddressSpace();
1203 
1204   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1205   // vector element.
1206   assert(!DstTy.isVector());
1207 
1208   const AMDGPUTargetMachine &TM
1209     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1210 
1211   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1212   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1213     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1214     return true;
1215   }
1216 
1217   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1218     // Truncate.
1219     B.buildExtract(Dst, Src, 0);
1220     MI.eraseFromParent();
1221     return true;
1222   }
1223 
1224   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1225     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1226     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1227 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1231     auto HighAddr = B.buildConstant(
1232       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1233     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1234     MI.eraseFromParent();
1235     return true;
1236   }
1237 
1238   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1239     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1240            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1241     unsigned NullVal = TM.getNullPointerValue(DestAS);
1242 
1243     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1244     auto FlatNull = B.buildConstant(SrcTy, 0);
1245 
1246     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1247 
1248     // Extract low 32-bits of the pointer.
1249     B.buildExtract(PtrLo32, Src, 0);
1250 
1251     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1252     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1253     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1254 
1255     MI.eraseFromParent();
1256     return true;
1257   }
1258 
1259   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1260     return false;
1261 
1262   if (!ST.hasFlatAddressSpace())
1263     return false;
1264 
1265   auto SegmentNull =
1266       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1267   auto FlatNull =
1268       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1269 
1270   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1271   if (!ApertureReg.isValid())
1272     return false;
1273 
1274   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1275   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1276 
1277   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1278 
1279   // Coerce the type of the low half of the result so we can use merge_values.
1280   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1281   B.buildInstr(TargetOpcode::G_PTRTOINT)
1282     .addDef(SrcAsInt)
1283     .addUse(Src);
1284 
1285   // TODO: Should we allow mismatched types but matching sizes in merges to
1286   // avoid the ptrtoint?
1287   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1288   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1289 
1290   MI.eraseFromParent();
1291   return true;
1292 }
1293 
1294 bool AMDGPULegalizerInfo::legalizeFrint(
1295   MachineInstr &MI, MachineRegisterInfo &MRI,
1296   MachineIRBuilder &B) const {
1297   B.setInstr(MI);
1298 
1299   Register Src = MI.getOperand(1).getReg();
1300   LLT Ty = MRI.getType(Src);
1301   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1302 
1303   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1304   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1305 
1306   auto C1 = B.buildFConstant(Ty, C1Val);
1307   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1308 
1309   // TODO: Should this propagate fast-math-flags?
1310   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1311   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1312 
1313   auto C2 = B.buildFConstant(Ty, C2Val);
1314   auto Fabs = B.buildFAbs(Ty, Src);
1315 
1316   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1317   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1318   return true;
1319 }
1320 
1321 bool AMDGPULegalizerInfo::legalizeFceil(
1322   MachineInstr &MI, MachineRegisterInfo &MRI,
1323   MachineIRBuilder &B) const {
1324   B.setInstr(MI);
1325 
1326   const LLT S1 = LLT::scalar(1);
1327   const LLT S64 = LLT::scalar(64);
1328 
1329   Register Src = MI.getOperand(1).getReg();
1330   assert(MRI.getType(Src) == S64);
1331 
1332   // result = trunc(src)
1333   // if (src > 0.0 && src != result)
1334   //   result += 1.0
1335 
1336   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1337 
1338   const auto Zero = B.buildFConstant(S64, 0.0);
1339   const auto One = B.buildFConstant(S64, 1.0);
1340   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1341   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1342   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1343   auto Add = B.buildSelect(S64, And, One, Zero);
1344 
1345   // TODO: Should this propagate fast-math-flags?
1346   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1347   return true;
1348 }
1349 
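// Extract the 11-bit biased exponent from the high 32 bits of an f64 value and
// subtract the exponent bias (1023).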
1350 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1351                                               MachineIRBuilder &B) {
1352   const unsigned FractBits = 52;
1353   const unsigned ExpBits = 11;
1354   LLT S32 = LLT::scalar(32);
1355 
1356   auto Const0 = B.buildConstant(S32, FractBits - 32);
1357   auto Const1 = B.buildConstant(S32, ExpBits);
1358 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1362 
1363   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1364 }
1365 
1366 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1367   MachineInstr &MI, MachineRegisterInfo &MRI,
1368   MachineIRBuilder &B) const {
1369   B.setInstr(MI);
1370 
1371   const LLT S1 = LLT::scalar(1);
1372   const LLT S32 = LLT::scalar(32);
1373   const LLT S64 = LLT::scalar(64);
1374 
1375   Register Src = MI.getOperand(1).getReg();
1376   assert(MRI.getType(Src) == S64);
1377 
1378   // TODO: Should this use extract since the low half is unused?
1379   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1380   Register Hi = Unmerge.getReg(1);
1381 
1382   // Extract the upper half, since this is where we will find the sign and
1383   // exponent.
1384   auto Exp = extractF64Exponent(Hi, B);
1385 
1386   const unsigned FractBits = 52;
1387 
1388   // Extract the sign bit.
1389   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1390   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1391 
1392   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1393 
1394   const auto Zero32 = B.buildConstant(S32, 0);
1395 
1396   // Extend back to 64-bits.
1397   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1398 
1399   auto Shr = B.buildAShr(S64, FractMask, Exp);
1400   auto Not = B.buildNot(S64, Shr);
1401   auto Tmp0 = B.buildAnd(S64, Src, Not);
1402   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1403 
1404   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1405   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1406 
1407   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1408   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1409   return true;
1410 }
1411 
1412 bool AMDGPULegalizerInfo::legalizeITOFP(
1413   MachineInstr &MI, MachineRegisterInfo &MRI,
1414   MachineIRBuilder &B, bool Signed) const {
1415   B.setInstr(MI);
1416 
1417   Register Dst = MI.getOperand(0).getReg();
1418   Register Src = MI.getOperand(1).getReg();
1419 
1420   const LLT S64 = LLT::scalar(64);
1421   const LLT S32 = LLT::scalar(32);
1422 
1423   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1424 
1425   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1426 
1427   auto CvtHi = Signed ?
1428     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1429     B.buildUITOFP(S64, Unmerge.getReg(1));
1430 
1431   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1432 
1433   auto ThirtyTwo = B.buildConstant(S32, 32);
1434   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1435     .addUse(CvtHi.getReg(0))
1436     .addUse(ThirtyTwo.getReg(0));
1437 
1438   // TODO: Should this propagate fast-math-flags?
1439   B.buildFAdd(Dst, LdExp, CvtLo);
1440   MI.eraseFromParent();
1441   return true;
1442 }
1443 
1444 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1445   MachineInstr &MI, MachineRegisterInfo &MRI,
1446   MachineIRBuilder &B) const {
1447   MachineFunction &MF = B.getMF();
1448   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1449 
1450   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1451                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1452 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1455   if (!MFI->getMode().IEEE)
1456     return !IsIEEEOp;
1457 
1458   if (IsIEEEOp)
1459     return true;
1460 
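  // In IEEE mode the hardware min/max matches the *_IEEE opcodes, so plain
  // G_FMINNUM/G_FMAXNUM must be expanded; the helper quiets the inputs as
  // needed and emits the IEEE variant.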
1461   MachineIRBuilder HelperBuilder(MI);
1462   GISelObserverWrapper DummyObserver;
1463   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1464   HelperBuilder.setInstr(MI);
1465   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1466 }
1467 
1468 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1469   MachineInstr &MI, MachineRegisterInfo &MRI,
1470   MachineIRBuilder &B) const {
1471   // TODO: Should move some of this into LegalizerHelper.
1472 
1473   // TODO: Promote dynamic indexing of s16 to s32
1474   // TODO: Dynamic s64 indexing is only legal for SGPR.
1475   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1476   if (!IdxVal) // Dynamic case will be selected to register indexing.
1477     return true;
1478 
1479   Register Dst = MI.getOperand(0).getReg();
1480   Register Vec = MI.getOperand(1).getReg();
1481 
1482   LLT VecTy = MRI.getType(Vec);
1483   LLT EltTy = VecTy.getElementType();
1484   assert(EltTy == MRI.getType(Dst));
1485 
1486   B.setInstr(MI);
1487 
1488   if (IdxVal.getValue() < VecTy.getNumElements())
1489     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1490   else
1491     B.buildUndef(Dst);
1492 
1493   MI.eraseFromParent();
1494   return true;
1495 }
1496 
1497 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1498   MachineInstr &MI, MachineRegisterInfo &MRI,
1499   MachineIRBuilder &B) const {
1500   // TODO: Should move some of this into LegalizerHelper.
1501 
1502   // TODO: Promote dynamic indexing of s16 to s32
1503   // TODO: Dynamic s64 indexing is only legal for SGPR.
1504   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1505   if (!IdxVal) // Dynamic case will be selected to register indexing.
1506     return true;
1507 
1508   Register Dst = MI.getOperand(0).getReg();
1509   Register Vec = MI.getOperand(1).getReg();
1510   Register Ins = MI.getOperand(2).getReg();
1511 
1512   LLT VecTy = MRI.getType(Vec);
1513   LLT EltTy = VecTy.getElementType();
1514   assert(EltTy == MRI.getType(Ins));
1515 
1516   B.setInstr(MI);
1517 
1518   if (IdxVal.getValue() < VecTy.getNumElements())
1519     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1520   else
1521     B.buildUndef(Dst);
1522 
1523   MI.eraseFromParent();
1524   return true;
1525 }
1526 
1527 bool AMDGPULegalizerInfo::legalizeSinCos(
1528   MachineInstr &MI, MachineRegisterInfo &MRI,
1529   MachineIRBuilder &B) const {
1530   B.setInstr(MI);
1531 
1532   Register DstReg = MI.getOperand(0).getReg();
1533   Register SrcReg = MI.getOperand(1).getReg();
1534   LLT Ty = MRI.getType(DstReg);
1535   unsigned Flags = MI.getFlags();
1536 
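  // The hardware sin/cos intrinsics expect an input pre-scaled by 1/(2*pi).
  // Subtargets with a reduced valid input range additionally need the scaled
  // value wrapped into [0, 1) with amdgcn.fract first.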
1537   Register TrigVal;
1538   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1539   if (ST.hasTrigReducedRange()) {
1540     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1541     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1542       .addUse(MulVal.getReg(0))
1543       .setMIFlags(Flags).getReg(0);
1544   } else
1545     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1546 
1547   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1548     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1549   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1550     .addUse(TrigVal)
1551     .setMIFlags(Flags);
1552   MI.eraseFromParent();
1553   return true;
1554 }
1555 
1556 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1557   Register DstReg, LLT PtrTy,
1558   MachineIRBuilder &B, const GlobalValue *GV,
1559   unsigned Offset, unsigned GAFlags) const {
1560   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1561   // to the following code sequence:
1562   //
1563   // For constant address space:
1564   //   s_getpc_b64 s[0:1]
1565   //   s_add_u32 s0, s0, $symbol
1566   //   s_addc_u32 s1, s1, 0
1567   //
1568   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1569   //   a fixup or relocation is emitted to replace $symbol with a literal
1570   //   constant, which is a pc-relative offset from the encoding of the $symbol
1571   //   operand to the global variable.
1572   //
1573   // For global address space:
1574   //   s_getpc_b64 s[0:1]
1575   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1576   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1577   //
1578   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1579   //   fixups or relocations are emitted to replace $symbol@*@lo and
1580   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1581   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1582   //   operand to the global variable.
1583   //
1584   // What we want here is an offset from the value returned by s_getpc
1585   // (which is the address of the s_add_u32 instruction) to the global
1586   // variable, but since the encoding of $symbol starts 4 bytes after the start
1587   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1588   // small. This requires us to add 4 to the global variable offset in order to
1589   // compute the correct address.
1590 
1591   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1592 
1593   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1594     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1595 
1596   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1597     .addDef(PCReg);
1598 
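  // The lo literal is encoded 4 bytes after the address returned by
  // s_getpc_b64, and the hi literal 12 bytes after it (following the 8-byte
  // s_add_u32 and the 4-byte s_addc_u32 opcode), hence the +4 and +12
  // adjustments to the symbol offset.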
1599   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1600   if (GAFlags == SIInstrInfo::MO_NONE)
1601     MIB.addImm(0);
1602   else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
1604 
1605   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1606 
1607   if (PtrTy.getSizeInBits() == 32)
1608     B.buildExtract(DstReg, PCReg, 0);
1609   return true;
}
1611 
1612 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1613   MachineInstr &MI, MachineRegisterInfo &MRI,
1614   MachineIRBuilder &B) const {
1615   Register DstReg = MI.getOperand(0).getReg();
1616   LLT Ty = MRI.getType(DstReg);
1617   unsigned AS = Ty.getAddressSpace();
1618 
1619   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1620   MachineFunction &MF = B.getMF();
1621   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1622   B.setInstr(MI);
1623 
1624   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1625     if (!MFI->isEntryFunction()) {
1626       const Function &Fn = MF.getFunction();
1627       DiagnosticInfoUnsupported BadLDSDecl(
1628         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1629       Fn.getContext().diagnose(BadLDSDecl);
1630     }
1631 
1632     // TODO: We could emit code to handle the initialization somewhere.
1633     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1634       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1635       MI.eraseFromParent();
1636       return true;
1637     }
1638 
1639     const Function &Fn = MF.getFunction();
1640     DiagnosticInfoUnsupported BadInit(
1641       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1642     Fn.getContext().diagnose(BadInit);
1643     return true;
1644   }
1645 
1646   const SITargetLowering *TLI = ST.getTargetLowering();
1647 
1648   if (TLI->shouldEmitFixup(GV)) {
1649     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1650     MI.eraseFromParent();
1651     return true;
1652   }
1653 
1654   if (TLI->shouldEmitPCReloc(GV)) {
1655     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1656     MI.eraseFromParent();
1657     return true;
1658   }
1659 
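  // Otherwise the address is loaded from the GOT: build a pc-relative pointer
  // to the 64-bit GOT entry and load it (invariant and dereferenceable).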
1660   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1661   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1662 
1663   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1664     MachinePointerInfo::getGOT(MF),
1665     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1666     MachineMemOperand::MOInvariant,
1667     8 /*Size*/, 8 /*Align*/);
1668 
1669   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1670 
1671   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1673     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1674     B.buildExtract(DstReg, Load, 0);
1675   } else
1676     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1677 
1678   MI.eraseFromParent();
1679   return true;
1680 }
1681 
1682 bool AMDGPULegalizerInfo::legalizeLoad(
1683   MachineInstr &MI, MachineRegisterInfo &MRI,
1684   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
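  // Rewrite the pointer operand to a 64-bit constant-address-space pointer via
  // an addrspacecast so the load can be selected normally (used for loads
  // whose original pointer type is not directly handled, e.g. the 32-bit
  // constant address space).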
1685   B.setInstr(MI);
1686   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1687   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1688   Observer.changingInstr(MI);
1689   MI.getOperand(1).setReg(Cast.getReg(0));
1690   Observer.changedInstr(MI);
1691   return true;
1692 }
1693 
1694 bool AMDGPULegalizerInfo::legalizeFMad(
1695   MachineInstr &MI, MachineRegisterInfo &MRI,
1696   MachineIRBuilder &B) const {
1697   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1698   assert(Ty.isScalar());
1699 
1700   // TODO: Always legal with future ftz flag.
1701   if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1702     return true;
1703   if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1704     return true;
1705 
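  // Denormals are enabled for this type, so a single mad/fmac cannot be used;
  // expand to fmul + fadd with the generic helper instead.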
1706   MachineFunction &MF = B.getMF();
1707 
1708   MachineIRBuilder HelperBuilder(MI);
1709   GISelObserverWrapper DummyObserver;
1710   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1711   HelperBuilder.setMBB(*MI.getParent());
1712   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1713 }
1714 
// Return the branch instruction that uses the condition, or null if the usage
// is invalid.
1716 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1717                                        MachineRegisterInfo &MRI) {
1718   Register CondDef = MI.getOperand(0).getReg();
1719   if (!MRI.hasOneNonDBGUse(CondDef))
1720     return nullptr;
1721 
1722   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1723   return UseMI.getParent() == MI.getParent() &&
1724     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1725 }
1726 
1727 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1728                                                 Register Reg, LLT Ty) const {
1729   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1730   if (LiveIn)
1731     return LiveIn;
1732 
1733   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1734   MRI.addLiveIn(Reg, NewReg);
1735   return NewReg;
1736 }
1737 
1738 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1739                                          const ArgDescriptor *Arg) const {
1740   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1741     return false; // TODO: Handle these
1742 
1743   assert(Arg->getRegister().isPhysical());
1744 
1745   MachineRegisterInfo &MRI = *B.getMRI();
1746 
1747   LLT Ty = MRI.getType(DstReg);
1748   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1749 
1750   if (Arg->isMasked()) {
1751     // TODO: Should we try to emit this once in the entry block?
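    // A masked argument occupies only a bitfield of its register (e.g. the
    // packed workitem IDs), so shift the field down and mask off the rest.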
1752     const LLT S32 = LLT::scalar(32);
1753     const unsigned Mask = Arg->getMask();
1754     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1755 
1756     Register AndMaskSrc = LiveIn;
1757 
1758     if (Shift != 0) {
1759       auto ShiftAmt = B.buildConstant(S32, Shift);
1760       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1761     }
1762 
1763     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1764   } else
1765     B.buildCopy(DstReg, LiveIn);
1766 
  // Insert the argument copy if it doesn't already exist.
1768   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1769   if (!MRI.getVRegDef(LiveIn)) {
1770     // FIXME: Should have scoped insert pt
1771     MachineBasicBlock &OrigInsBB = B.getMBB();
1772     auto OrigInsPt = B.getInsertPt();
1773 
1774     MachineBasicBlock &EntryMBB = B.getMF().front();
1775     EntryMBB.addLiveIn(Arg->getRegister());
1776     B.setInsertPt(EntryMBB, EntryMBB.begin());
1777     B.buildCopy(LiveIn, Arg->getRegister());
1778 
1779     B.setInsertPt(OrigInsBB, OrigInsPt);
1780   }
1781 
1782   return true;
1783 }
1784 
1785 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1786   MachineInstr &MI,
1787   MachineRegisterInfo &MRI,
1788   MachineIRBuilder &B,
1789   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1790   B.setInstr(MI);
1791 
1792   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1793 
1794   const ArgDescriptor *Arg;
1795   const TargetRegisterClass *RC;
1796   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1797   if (!Arg) {
1798     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1799     return false;
1800   }
1801 
1802   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1803     MI.eraseFromParent();
1804     return true;
1805   }
1806 
1807   return false;
1808 }
1809 
1810 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1811                                            MachineRegisterInfo &MRI,
1812                                            MachineIRBuilder &B) const {
1813   B.setInstr(MI);
1814   Register Res = MI.getOperand(0).getReg();
1815   Register LHS = MI.getOperand(2).getReg();
1816   Register RHS = MI.getOperand(3).getReg();
1817   uint16_t Flags = MI.getFlags();
1818 
1819   LLT S32 = LLT::scalar(32);
1820   LLT S1 = LLT::scalar(1);
1821 
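  // Computes Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS, where Sel is 2^-32
  // when |RHS| > 2^96 (0x6f800000) and 1.0 otherwise. Pre-scaling a huge
  // denominator keeps the reciprocal and the intermediate products in range.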
1822   auto Abs = B.buildFAbs(S32, RHS, Flags);
1823   const APFloat C0Val(1.0f);
1824 
1825   auto C0 = B.buildConstant(S32, 0x6f800000);
1826   auto C1 = B.buildConstant(S32, 0x2f800000);
1827   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1828 
1829   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1830   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1831 
1832   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1833 
1834   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1835     .addUse(Mul0.getReg(0))
1836     .setMIFlags(Flags);
1837 
1838   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1839 
1840   B.buildFMul(Res, Sel, Mul1, Flags);
1841 
1842   MI.eraseFromParent();
1843   return true;
1844 }
1845 
1846 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1847                                                  MachineRegisterInfo &MRI,
1848                                                  MachineIRBuilder &B) const {
1849   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1850   if (!MFI->isEntryFunction()) {
1851     return legalizePreloadedArgIntrin(MI, MRI, B,
1852                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1853   }
1854 
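  // In a kernel the implicit arguments live at a fixed offset past the
  // explicit kernel arguments, so the pointer is kernarg_segment_ptr plus the
  // implicit parameter offset.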
1855   B.setInstr(MI);
1856 
1857   uint64_t Offset =
1858     ST.getTargetLowering()->getImplicitParameterOffset(
1859       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1860   Register DstReg = MI.getOperand(0).getReg();
1861   LLT DstTy = MRI.getType(DstReg);
1862   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1863 
1864   const ArgDescriptor *Arg;
1865   const TargetRegisterClass *RC;
1866   std::tie(Arg, RC)
1867     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1868   if (!Arg)
1869     return false;
1870 
1871   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1872   if (!loadInputValue(KernargPtrReg, B, Arg))
1873     return false;
1874 
1875   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1876   MI.eraseFromParent();
1877   return true;
1878 }
1879 
1880 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1881                                               MachineRegisterInfo &MRI,
1882                                               MachineIRBuilder &B,
1883                                               unsigned AddrSpace) const {
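  // A flat pointer lies in the queried segment iff the high 32 bits of the
  // address equal that segment's aperture base, so compare the top half of
  // the pointer against the aperture register.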
1884   B.setInstr(MI);
1885   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1886   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1887   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1888   MI.eraseFromParent();
1889   return true;
1890 }
1891 
1892 /// Handle register layout difference for f16 images for some subtargets.
1893 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1894                                              MachineRegisterInfo &MRI,
1895                                              Register Reg) const {
1896   if (!ST.hasUnpackedD16VMem())
1897     return Reg;
1898 
1899   const LLT S16 = LLT::scalar(16);
1900   const LLT S32 = LLT::scalar(32);
1901   LLT StoreVT = MRI.getType(Reg);
1902   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1903 
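  // In the unpacked layout every 16-bit element occupies the low half of its
  // own 32-bit register, so any-extend each element and rebuild the value as
  // a vector of s32.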
1904   auto Unmerge = B.buildUnmerge(S16, Reg);
1905 
1906   SmallVector<Register, 4> WideRegs;
1907   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1908     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1909 
1910   int NumElts = StoreVT.getNumElements();
1911 
1912   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1913 }
1914 
1915 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1916                                                  MachineRegisterInfo &MRI,
1917                                                  MachineIRBuilder &B,
1918                                                  bool IsFormat) const {
1919   // TODO: Reject f16 format on targets where unsupported.
1920   Register VData = MI.getOperand(1).getReg();
1921   LLT Ty = MRI.getType(VData);
1922 
1923   B.setInstr(MI);
1924 
1925   const LLT S32 = LLT::scalar(32);
1926   const LLT S16 = LLT::scalar(16);
1927 
1928   // Fixup illegal register types for i8 stores.
1929   if (Ty == LLT::scalar(8) || Ty == S16) {
1930     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1931     MI.getOperand(1).setReg(AnyExt);
1932     return true;
1933   }
1934 
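  // Vectors of up to four 32-bit elements are handled directly; up to four
  // 16-bit elements are too, though format stores need the unpacked-register
  // fixup on subtargets that use the unpacked D16 layout.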
1935   if (Ty.isVector()) {
1936     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1937       if (IsFormat)
1938         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1939       return true;
1940     }
1941 
1942     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1943   }
1944 
1945   return Ty == S32;
1946 }
1947 
1948 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1949                                             MachineRegisterInfo &MRI,
1950                                             MachineIRBuilder &B) const {
  // For the control flow intrinsics, replace the G_BRCOND that uses their
  // result with the exec-manipulating branch pseudos (SI_IF / SI_LOOP).
1952   switch (MI.getIntrinsicID()) {
1953   case Intrinsic::amdgcn_if: {
1954     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1955       const SIRegisterInfo *TRI
1956         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1957 
1958       B.setInstr(*BrCond);
1959       Register Def = MI.getOperand(1).getReg();
1960       Register Use = MI.getOperand(3).getReg();
1961       B.buildInstr(AMDGPU::SI_IF)
1962         .addDef(Def)
1963         .addUse(Use)
1964         .addMBB(BrCond->getOperand(1).getMBB());
1965 
1966       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1967       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1968       MI.eraseFromParent();
1969       BrCond->eraseFromParent();
1970       return true;
1971     }
1972 
1973     return false;
1974   }
1975   case Intrinsic::amdgcn_loop: {
1976     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1977       const SIRegisterInfo *TRI
1978         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1979 
1980       B.setInstr(*BrCond);
1981       Register Reg = MI.getOperand(2).getReg();
1982       B.buildInstr(AMDGPU::SI_LOOP)
1983         .addUse(Reg)
1984         .addMBB(BrCond->getOperand(1).getMBB());
1985       MI.eraseFromParent();
1986       BrCond->eraseFromParent();
1987       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1988       return true;
1989     }
1990 
1991     return false;
1992   }
1993   case Intrinsic::amdgcn_kernarg_segment_ptr:
1994     return legalizePreloadedArgIntrin(
1995       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1996   case Intrinsic::amdgcn_implicitarg_ptr:
1997     return legalizeImplicitArgPtr(MI, MRI, B);
1998   case Intrinsic::amdgcn_workitem_id_x:
1999     return legalizePreloadedArgIntrin(MI, MRI, B,
2000                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2001   case Intrinsic::amdgcn_workitem_id_y:
2002     return legalizePreloadedArgIntrin(MI, MRI, B,
2003                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2004   case Intrinsic::amdgcn_workitem_id_z:
2005     return legalizePreloadedArgIntrin(MI, MRI, B,
2006                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2007   case Intrinsic::amdgcn_workgroup_id_x:
2008     return legalizePreloadedArgIntrin(MI, MRI, B,
2009                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2010   case Intrinsic::amdgcn_workgroup_id_y:
2011     return legalizePreloadedArgIntrin(MI, MRI, B,
2012                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2013   case Intrinsic::amdgcn_workgroup_id_z:
2014     return legalizePreloadedArgIntrin(MI, MRI, B,
2015                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2016   case Intrinsic::amdgcn_dispatch_ptr:
2017     return legalizePreloadedArgIntrin(MI, MRI, B,
2018                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2019   case Intrinsic::amdgcn_queue_ptr:
2020     return legalizePreloadedArgIntrin(MI, MRI, B,
2021                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2022   case Intrinsic::amdgcn_implicit_buffer_ptr:
2023     return legalizePreloadedArgIntrin(
2024       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2025   case Intrinsic::amdgcn_dispatch_id:
2026     return legalizePreloadedArgIntrin(MI, MRI, B,
2027                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2028   case Intrinsic::amdgcn_fdiv_fast:
2029     return legalizeFDIVFast(MI, MRI, B);
2030   case Intrinsic::amdgcn_is_shared:
2031     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2032   case Intrinsic::amdgcn_is_private:
2033     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2034   case Intrinsic::amdgcn_wavefrontsize: {
2035     B.setInstr(MI);
2036     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2037     MI.eraseFromParent();
2038     return true;
2039   }
2040   case Intrinsic::amdgcn_raw_buffer_store:
2041     return legalizeRawBufferStore(MI, MRI, B, false);
2042   case Intrinsic::amdgcn_raw_buffer_store_format:
2043     return legalizeRawBufferStore(MI, MRI, B, true);
2044   default:
2045     return true;
2046   }
2047 
2048   return true;
2049 }
2050