1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
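// Reduce a too-wide vector so each resulting piece is at most 64 bits; e.g.
// for <3 x s32> (96 bits) the new per-piece type is <2 x s32> (64 bits).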
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
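// For example, <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).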
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
// Vectors of 32-bit or 64-bit (or 128/256-bit) elements, vectors with an even
// number of 16-bit elements (i.e. multiples of v2s16), and scalars that are a
// multiple of 32 bits up to 1024 bits.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
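  // Within each builder chain below, rules are checked in the order they were
  // added; the first matching rule determines the action for a given query.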
247   setAction({G_BRCOND, S1}, Legal);
248 
249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250   // elements for v3s16
251   getActionDefinitionsBuilder(G_PHI)
252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253     .legalFor(AllS32Vectors)
254     .legalFor(AllS64Vectors)
255     .legalFor(AddrSpaces64)
256     .legalFor(AddrSpaces32)
257     .clampScalar(0, S32, S256)
258     .widenScalarToNextPow2(0, 32)
259     .clampMaxNumElements(0, S32, 16)
260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261     .legalIf(isPointer(0));
262 
263   if (ST.has16BitInsts()) {
264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265       .legalFor({S32, S16})
266       .clampScalar(0, S16, S32)
267       .scalarize(0);
268   } else {
269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270       .legalFor({S32})
271       .clampScalar(0, S32, S32)
272       .scalarize(0);
273   }
274 
275   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276     .legalFor({S32})
277     .clampScalar(0, S32, S32)
278     .scalarize(0);
279 
  // Report legal for any types we can handle anywhere. For the cases that are
  // only legal on the SALU, RegBankSelect will be able to re-legalize.
282   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284     .clampScalar(0, S32, S64)
285     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287     .widenScalarToNextPow2(0)
288     .scalarize(0);
289 
290   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292     .legalFor({{S32, S1}})
293     .clampScalar(0, S32, S32)
294     .scalarize(0); // TODO: Implement.
295 
296   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297     .lower();
298 
299   getActionDefinitionsBuilder(G_BITCAST)
300     // Don't worry about the size constraint.
301     .legalIf(all(isRegisterType(0), isRegisterType(1)))
302     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});
304 
305   getActionDefinitionsBuilder(G_FCONSTANT)
306     .legalFor({S32, S64, S16})
307     .clampScalar(0, S16, S64);
308 
309   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .clampScalarOrElt(0, S32, S1024)
314     .legalIf(isMultiple32(0))
315     .widenScalarToNextPow2(0, 32)
316     .clampMaxNumElements(0, S32, 16);
317 
318 
319   // FIXME: i1 operands to intrinsics should always be legal, but other i1
320   // values may not be legal.  We need to figure out how to distinguish
321   // between these two scenarios.
322   getActionDefinitionsBuilder(G_CONSTANT)
323     .legalFor({S1, S32, S64, S16, GlobalPtr,
324                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325     .clampScalar(0, S32, S64)
326     .widenScalarToNextPow2(0)
327     .legalIf(isPointer(0));
328 
329   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332 
333 
334   auto &FPOpActions = getActionDefinitionsBuilder(
335     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336     .legalFor({S32, S64});
337   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338     .customFor({S32, S64});
339 
340   if (ST.has16BitInsts()) {
341     if (ST.hasVOP3PInsts())
342       FPOpActions.legalFor({S16, V2S16});
343     else
344       FPOpActions.legalFor({S16});
345 
346     TrigActions.customFor({S16});
347   }
348 
349   auto &MinNumMaxNum = getActionDefinitionsBuilder({
350       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
351 
352   if (ST.hasVOP3PInsts()) {
353     MinNumMaxNum.customFor(FPTypesPK16)
354       .clampMaxNumElements(0, S16, 2)
355       .clampScalar(0, S16, S64)
356       .scalarize(0);
357   } else if (ST.has16BitInsts()) {
358     MinNumMaxNum.customFor(FPTypes16)
359       .clampScalar(0, S16, S64)
360       .scalarize(0);
361   } else {
362     MinNumMaxNum.customFor(FPTypesBase)
363       .clampScalar(0, S32, S64)
364       .scalarize(0);
365   }
366 
367   if (ST.hasVOP3PInsts())
368     FPOpActions.clampMaxNumElements(0, S16, 2);
369 
370   FPOpActions
371     .scalarize(0)
372     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
373 
374   TrigActions
375     .scalarize(0)
376     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
377 
378   getActionDefinitionsBuilder({G_FNEG, G_FABS})
379     .legalFor(FPTypesPK16)
380     .clampMaxNumElements(0, S16, 2)
381     .scalarize(0)
382     .clampScalar(0, S16, S64);
383 
384   // TODO: Implement
385   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
386 
387   if (ST.has16BitInsts()) {
388     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
389       .legalFor({S32, S64, S16})
390       .scalarize(0)
391       .clampScalar(0, S16, S64);
392   } else {
393     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
394       .legalFor({S32, S64})
395       .scalarize(0)
396       .clampScalar(0, S32, S64);
397   }
398 
399   getActionDefinitionsBuilder(G_FPTRUNC)
400     .legalFor({{S32, S64}, {S16, S32}})
401     .scalarize(0);
402 
403   getActionDefinitionsBuilder(G_FPEXT)
404     .legalFor({{S64, S32}, {S32, S16}})
405     .lowerFor({{S64, S16}}) // FIXME: Implement
406     .scalarize(0);
407 
408   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
409   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
410 
411   getActionDefinitionsBuilder(G_FSUB)
412       // Use actual fsub instruction
413       .legalFor({S32})
414       // Must use fadd + fneg
415       .lowerFor({S64, S16, V2S16})
416       .scalarize(0)
417       .clampScalar(0, S32, S64);
418 
419   // Whether this is legal depends on the floating point mode for the function.
420   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
421   if (ST.hasMadF16())
422     FMad.customFor({S32, S16});
423   else
424     FMad.customFor({S32});
425   FMad.scalarize(0)
426       .lower();
427 
428   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
429     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
430                {S32, S1}, {S64, S1}, {S16, S1},
431                {S96, S32},
432                // FIXME: Hack
433                {S64, LLT::scalar(33)},
434                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
435     .scalarize(0);
436 
437   // TODO: Split s1->s64 during regbankselect for VALU.
438   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
439     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
440     .lowerFor({{S32, S64}})
441     .customFor({{S64, S64}});
442   if (ST.has16BitInsts())
443     IToFP.legalFor({{S16, S16}});
444   IToFP.clampScalar(1, S32, S64)
445        .scalarize(0);
446 
447   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
448     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
449   if (ST.has16BitInsts())
450     FPToI.legalFor({{S16, S16}});
451   else
452     FPToI.minScalar(1, S32);
453 
454   FPToI.minScalar(0, S32)
455        .scalarize(0);
456 
457   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
458     .legalFor({S32, S64})
459     .scalarize(0);
460 
461   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
462     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
463       .legalFor({S32, S64})
464       .clampScalar(0, S32, S64)
465       .scalarize(0);
466   } else {
467     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
468       .legalFor({S32})
469       .customFor({S64})
470       .clampScalar(0, S32, S64)
471       .scalarize(0);
472   }
473 
474   getActionDefinitionsBuilder(G_GEP)
475     .legalForCartesianProduct(AddrSpaces64, {S64})
476     .legalForCartesianProduct(AddrSpaces32, {S32})
477     .scalarize(0);
478 
479   getActionDefinitionsBuilder(G_PTR_MASK)
480     .scalarize(0)
481     .alwaysLegal();
482 
483   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
484 
485   auto &CmpBuilder =
486     getActionDefinitionsBuilder(G_ICMP)
487     .legalForCartesianProduct(
488       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
489     .legalFor({{S1, S32}, {S1, S64}});
490   if (ST.has16BitInsts()) {
491     CmpBuilder.legalFor({{S1, S16}});
492   }
493 
494   CmpBuilder
495     .widenScalarToNextPow2(1)
496     .clampScalar(1, S32, S64)
497     .scalarize(0)
498     .legalIf(all(typeIs(0, S1), isPointer(1)));
499 
500   getActionDefinitionsBuilder(G_FCMP)
501     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
502     .widenScalarToNextPow2(1)
503     .clampScalar(1, S32, S64)
504     .scalarize(0);
505 
506   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
507   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
508                                G_FLOG, G_FLOG2, G_FLOG10})
509     .legalFor({S32})
510     .scalarize(0);
511 
512   // The 64-bit versions produce 32-bit results, but only on the SALU.
513   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
514                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
515                                G_CTPOP})
516     .legalFor({{S32, S32}, {S32, S64}})
517     .clampScalar(0, S32, S32)
518     .clampScalar(1, S32, S64)
519     .scalarize(0)
520     .widenScalarToNextPow2(0, 32)
521     .widenScalarToNextPow2(1, 32);
522 
523   // TODO: Expand for > s32
524   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
525     .legalFor({S32})
526     .clampScalar(0, S32, S32)
527     .scalarize(0);
528 
529   if (ST.has16BitInsts()) {
530     if (ST.hasVOP3PInsts()) {
531       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
532         .legalFor({S32, S16, V2S16})
533         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
534         .clampMaxNumElements(0, S16, 2)
535         .clampScalar(0, S16, S32)
536         .widenScalarToNextPow2(0)
537         .scalarize(0);
538     } else {
539       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
540         .legalFor({S32, S16})
541         .widenScalarToNextPow2(0)
542         .clampScalar(0, S16, S32)
543         .scalarize(0);
544     }
545   } else {
546     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
547       .legalFor({S32})
548       .clampScalar(0, S32, S32)
549       .widenScalarToNextPow2(0)
550       .scalarize(0);
551   }
552 
553   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
554     return [=](const LegalityQuery &Query) {
555       return Query.Types[TypeIdx0].getSizeInBits() <
556              Query.Types[TypeIdx1].getSizeInBits();
557     };
558   };
559 
560   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
561     return [=](const LegalityQuery &Query) {
562       return Query.Types[TypeIdx0].getSizeInBits() >
563              Query.Types[TypeIdx1].getSizeInBits();
564     };
565   };
566 
567   getActionDefinitionsBuilder(G_INTTOPTR)
568     // List the common cases
569     .legalForCartesianProduct(AddrSpaces64, {S64})
570     .legalForCartesianProduct(AddrSpaces32, {S32})
571     .scalarize(0)
572     // Accept any address space as long as the size matches
573     .legalIf(sameSize(0, 1))
574     .widenScalarIf(smallerThan(1, 0),
575       [](const LegalityQuery &Query) {
576         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
577       })
578     .narrowScalarIf(greaterThan(1, 0),
579       [](const LegalityQuery &Query) {
580         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
581       });
582 
583   getActionDefinitionsBuilder(G_PTRTOINT)
584     // List the common cases
585     .legalForCartesianProduct(AddrSpaces64, {S64})
586     .legalForCartesianProduct(AddrSpaces32, {S32})
587     .scalarize(0)
588     // Accept any address space as long as the size matches
589     .legalIf(sameSize(0, 1))
590     .widenScalarIf(smallerThan(0, 1),
591       [](const LegalityQuery &Query) {
592         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
593       })
594     .narrowScalarIf(
595       greaterThan(0, 1),
596       [](const LegalityQuery &Query) {
597         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
598       });
599 
600   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
601     .scalarize(0)
602     .custom();
603 
604   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
605   // handle some operations by just promoting the register during
606   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
607   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
608     switch (AS) {
609     // FIXME: Private element size.
610     case AMDGPUAS::PRIVATE_ADDRESS:
611       return 32;
612     // FIXME: Check subtarget
613     case AMDGPUAS::LOCAL_ADDRESS:
614       return ST.useDS128() ? 128 : 64;
615 
616     // Treat constant and global as identical. SMRD loads are sometimes usable
617     // for global loads (ideally constant address space should be eliminated)
618     // depending on the context. Legality cannot be context dependent, but
619     // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and whether the memory is invariant or not
    // written in a kernel.
622     case AMDGPUAS::CONSTANT_ADDRESS:
623     case AMDGPUAS::GLOBAL_ADDRESS:
624       return 512;
625     default:
626       return 128;
627     }
628   };
629 
630   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
631     const LLT DstTy = Query.Types[0];
632 
633     // Split vector extloads.
634     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
635     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
636       return true;
637 
638     const LLT PtrTy = Query.Types[1];
639     unsigned AS = PtrTy.getAddressSpace();
640     if (MemSize > maxSizeForAddrSpace(AS))
641       return true;
642 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
644     // TODO: May be able to widen depending on alignment etc.
645     unsigned NumRegs = MemSize / 32;
646     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
647       return true;
648 
649     unsigned Align = Query.MMODescrs[0].AlignInBits;
650     if (Align < MemSize) {
651       const SITargetLowering *TLI = ST.getTargetLowering();
652       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
653     }
654 
655     return false;
656   };
657 
658   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
659   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
660   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
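  // In the mem-desc tables below the last field is the minimum alignment in
  // bits; 0 (used when unaligned buffer access is available) places no
  // alignment requirement on the access.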
661 
662   // TODO: Refine based on subtargets which support unaligned access or 128-bit
663   // LDS
664   // TODO: Unsupported flat for SI.
665 
666   for (unsigned Op : {G_LOAD, G_STORE}) {
667     const bool IsStore = Op == G_STORE;
668 
669     auto &Actions = getActionDefinitionsBuilder(Op);
670     // Whitelist the common cases.
671     // TODO: Pointer loads
672     // TODO: Wide constant loads
673     // TODO: Only CI+ has 3x loads
674     // TODO: Loads to s16 on gfx9
675     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
676                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
677                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
678                                       {S96, GlobalPtr, 96, GlobalAlign32},
679                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
680                                       {S128, GlobalPtr, 128, GlobalAlign32},
681                                       {S64, GlobalPtr, 64, GlobalAlign32},
682                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
683                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
684                                       {S32, GlobalPtr, 8, GlobalAlign8},
685                                       {S32, GlobalPtr, 16, GlobalAlign16},
686 
687                                       {S32, LocalPtr, 32, 32},
688                                       {S64, LocalPtr, 64, 32},
689                                       {V2S32, LocalPtr, 64, 32},
690                                       {S32, LocalPtr, 8, 8},
691                                       {S32, LocalPtr, 16, 16},
692                                       {V2S16, LocalPtr, 32, 32},
693 
694                                       {S32, PrivatePtr, 32, 32},
695                                       {S32, PrivatePtr, 8, 8},
696                                       {S32, PrivatePtr, 16, 16},
697                                       {V2S16, PrivatePtr, 32, 32},
698 
699                                       {S32, FlatPtr, 32, GlobalAlign32},
700                                       {S32, FlatPtr, 16, GlobalAlign16},
701                                       {S32, FlatPtr, 8, GlobalAlign8},
702                                       {V2S16, FlatPtr, 32, GlobalAlign32},
703 
704                                       {S32, ConstantPtr, 32, GlobalAlign32},
705                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
706                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
707                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
708                                       {S64, ConstantPtr, 64, GlobalAlign32},
709                                       {S128, ConstantPtr, 128, GlobalAlign32},
710                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
711     Actions
712         .customIf(typeIs(1, Constant32Ptr))
713         .narrowScalarIf(
714             [=](const LegalityQuery &Query) -> bool {
715               return !Query.Types[0].isVector() && needToSplitLoad(Query);
716             },
717             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
718               const LLT DstTy = Query.Types[0];
719               const LLT PtrTy = Query.Types[1];
720 
721               const unsigned DstSize = DstTy.getSizeInBits();
722               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
723 
724               // Split extloads.
725               if (DstSize > MemSize)
726                 return std::make_pair(0, LLT::scalar(MemSize));
727 
728               if (DstSize > 32 && (DstSize % 32 != 0)) {
729                 // FIXME: Need a way to specify non-extload of larger size if
730                 // suitably aligned.
731                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
732               }
733 
734               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
735               if (MemSize > MaxSize)
736                 return std::make_pair(0, LLT::scalar(MaxSize));
737 
738               unsigned Align = Query.MMODescrs[0].AlignInBits;
739               return std::make_pair(0, LLT::scalar(Align));
740             })
741         .fewerElementsIf(
742             [=](const LegalityQuery &Query) -> bool {
743               return Query.Types[0].isVector() && needToSplitLoad(Query);
744             },
745             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
746               const LLT DstTy = Query.Types[0];
747               const LLT PtrTy = Query.Types[1];
748 
749               LLT EltTy = DstTy.getElementType();
750               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
751 
752               // Split if it's too large for the address space.
753               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
754                 unsigned NumElts = DstTy.getNumElements();
755                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
756 
757                 // FIXME: Refine when odd breakdowns handled
758                 // The scalars will need to be re-legalized.
759                 if (NumPieces == 1 || NumPieces >= NumElts ||
760                     NumElts % NumPieces != 0)
761                   return std::make_pair(0, EltTy);
762 
763                 return std::make_pair(0,
764                                       LLT::vector(NumElts / NumPieces, EltTy));
765               }
766 
767               // Need to split because of alignment.
768               unsigned Align = Query.MMODescrs[0].AlignInBits;
769               unsigned EltSize = EltTy.getSizeInBits();
770               if (EltSize > Align &&
771                   (EltSize / Align < DstTy.getNumElements())) {
772                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
773               }
774 
775               // May need relegalization for the scalars.
776               return std::make_pair(0, EltTy);
777             })
778         .minScalar(0, S32);
779 
780     if (IsStore)
781       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
782 
783     // TODO: Need a bitcast lower option?
784     Actions
785         .legalIf([=](const LegalityQuery &Query) {
786           const LLT Ty0 = Query.Types[0];
787           unsigned Size = Ty0.getSizeInBits();
788           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
789           unsigned Align = Query.MMODescrs[0].AlignInBits;
790 
791           // No extending vector loads.
792           if (Size > MemSize && Ty0.isVector())
793             return false;
794 
795           // FIXME: Widening store from alignment not valid.
796           if (MemSize < Size)
797             MemSize = std::max(MemSize, Align);
798 
799           switch (MemSize) {
800           case 8:
801           case 16:
802             return Size == 32;
803           case 32:
804           case 64:
805           case 128:
806             return true;
807           case 96:
808             return ST.hasDwordx3LoadStores();
809           case 256:
810           case 512:
811             return true;
812           default:
813             return false;
814           }
815         })
816         .widenScalarToNextPow2(0)
817         // TODO: v3s32->v4s32 with alignment
818         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
819   }
820 
821   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
822                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
823                                                   {S32, GlobalPtr, 16, 2 * 8},
824                                                   {S32, LocalPtr, 8, 8},
825                                                   {S32, LocalPtr, 16, 16},
826                                                   {S32, PrivatePtr, 8, 8},
827                                                   {S32, PrivatePtr, 16, 16},
828                                                   {S32, ConstantPtr, 8, 8},
829                                                   {S32, ConstantPtr, 16, 2 * 8}});
830   if (ST.hasFlatAddressSpace()) {
831     ExtLoads.legalForTypesWithMemDesc(
832         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
833   }
834 
835   ExtLoads.clampScalar(0, S32, S32)
836           .widenScalarToNextPow2(0)
837           .unsupportedIfMemSizeNotPow2()
838           .lower();
839 
840   auto &Atomics = getActionDefinitionsBuilder(
841     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
842      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
843      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
844      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
845     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
846                {S64, GlobalPtr}, {S64, LocalPtr}});
847   if (ST.hasFlatAddressSpace()) {
848     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
849   }
850 
851   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
852     .legalFor({{S32, LocalPtr}});
853 
854   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
855     .lower();
856 
857   // TODO: Pointer types, any 32-bit or 64-bit vector
858   getActionDefinitionsBuilder(G_SELECT)
859     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
860           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
861           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
862     .clampScalar(0, S16, S64)
863     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
864     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
865     .scalarize(1)
866     .clampMaxNumElements(0, S32, 2)
867     .clampMaxNumElements(0, LocalPtr, 2)
868     .clampMaxNumElements(0, PrivatePtr, 2)
869     .scalarize(0)
870     .widenScalarToNextPow2(0)
871     .legalIf(all(isPointer(0), typeIs(1, S1)));
872 
873   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
874   // be more flexible with the shift amount type.
875   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
876     .legalFor({{S32, S32}, {S64, S32}});
877   if (ST.has16BitInsts()) {
878     if (ST.hasVOP3PInsts()) {
879       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
880             .clampMaxNumElements(0, S16, 2);
881     } else
882       Shifts.legalFor({{S16, S32}, {S16, S16}});
883 
884     Shifts.clampScalar(1, S16, S32);
885     Shifts.clampScalar(0, S16, S64);
886     Shifts.widenScalarToNextPow2(0, 16);
887   } else {
888     // Make sure we legalize the shift amount type first, as the general
889     // expansion for the shifted type will produce much worse code if it hasn't
890     // been truncated already.
891     Shifts.clampScalar(1, S32, S32);
892     Shifts.clampScalar(0, S32, S64);
893     Shifts.widenScalarToNextPow2(0, 32);
894   }
895   Shifts.scalarize(0);
896 
897   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
898     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
899     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
900     unsigned IdxTypeIdx = 2;
901 
902     getActionDefinitionsBuilder(Op)
903       .customIf([=](const LegalityQuery &Query) {
904           const LLT EltTy = Query.Types[EltTypeIdx];
905           const LLT VecTy = Query.Types[VecTypeIdx];
906           const LLT IdxTy = Query.Types[IdxTypeIdx];
907           return (EltTy.getSizeInBits() == 16 ||
908                   EltTy.getSizeInBits() % 32 == 0) &&
909                  VecTy.getSizeInBits() % 32 == 0 &&
910                  VecTy.getSizeInBits() <= 1024 &&
911                  IdxTy.getSizeInBits() == 32;
912         })
913       .clampScalar(EltTypeIdx, S32, S64)
914       .clampScalar(VecTypeIdx, S32, S64)
915       .clampScalar(IdxTypeIdx, S32, S32);
916   }
917 
918   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
919     .unsupportedIf([=](const LegalityQuery &Query) {
920         const LLT &EltTy = Query.Types[1].getElementType();
921         return Query.Types[0] != EltTy;
922       });
923 
924   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
925     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
926     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
927 
928     // FIXME: Doesn't handle extract of illegal sizes.
929     getActionDefinitionsBuilder(Op)
930       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
931       // FIXME: Multiples of 16 should not be legal.
932       .legalIf([=](const LegalityQuery &Query) {
933           const LLT BigTy = Query.Types[BigTyIdx];
934           const LLT LitTy = Query.Types[LitTyIdx];
935           return (BigTy.getSizeInBits() % 32 == 0) &&
936                  (LitTy.getSizeInBits() % 16 == 0);
937         })
938       .widenScalarIf(
939         [=](const LegalityQuery &Query) {
940           const LLT BigTy = Query.Types[BigTyIdx];
941           return (BigTy.getScalarSizeInBits() < 16);
942         },
943         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
944       .widenScalarIf(
945         [=](const LegalityQuery &Query) {
946           const LLT LitTy = Query.Types[LitTyIdx];
947           return (LitTy.getScalarSizeInBits() < 16);
948         },
949         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
950       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
951       .widenScalarToNextPow2(BigTyIdx, 32);
952 
953   }
954 
955   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
956     .legalForCartesianProduct(AllS32Vectors, {S32})
957     .legalForCartesianProduct(AllS64Vectors, {S64})
958     .clampNumElements(0, V16S32, V32S32)
959     .clampNumElements(0, V2S64, V16S64)
960     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
961 
962   if (ST.hasScalarPackInsts())
963     BuildVector.legalFor({V2S16, S32});
964 
965   BuildVector
966     .minScalarSameAs(1, 0)
967     .legalIf(isRegisterType(0))
968     .minScalarOrElt(0, S32);
969 
970   if (ST.hasScalarPackInsts()) {
971     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
972       .legalFor({V2S16, S32})
973       .lower();
974   } else {
975     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
976       .lower();
977   }
978 
979   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
980     .legalIf(isRegisterType(0));
981 
982   // TODO: Don't fully scalarize v2s16 pieces
983   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
984 
985   // Merge/Unmerge
986   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
987     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
988     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
989 
990     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
991       const LLT &Ty = Query.Types[TypeIdx];
992       if (Ty.isVector()) {
993         const LLT &EltTy = Ty.getElementType();
994         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
995           return true;
996         if (!isPowerOf2_32(EltTy.getSizeInBits()))
997           return true;
998       }
999       return false;
1000     };
1001 
1002     auto &Builder = getActionDefinitionsBuilder(Op)
1003       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
1005       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1006       // valid.
1007       .clampScalar(LitTyIdx, S16, S256)
1008       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1009       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1010       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1011                            elementTypeIs(1, S16)),
1012                        changeTo(1, V2S16))
1013       // Break up vectors with weird elements into scalars
1014       .fewerElementsIf(
1015         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1016         scalarize(0))
1017       .fewerElementsIf(
1018         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1019         scalarize(1))
1020       .clampScalar(BigTyIdx, S32, S1024)
1021       .lowerFor({{S16, V2S16}});
1022 
1023     if (Op == G_MERGE_VALUES) {
1024       Builder.widenScalarIf(
1025         // TODO: Use 16-bit shifts if legal for 8-bit values?
1026         [=](const LegalityQuery &Query) {
1027           const LLT Ty = Query.Types[LitTyIdx];
1028           return Ty.getSizeInBits() < 32;
1029         },
1030         changeTo(LitTyIdx, S32));
1031     }
1032 
1033     Builder.widenScalarIf(
1034       [=](const LegalityQuery &Query) {
1035         const LLT Ty = Query.Types[BigTyIdx];
1036         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1037           Ty.getSizeInBits() % 16 != 0;
1038       },
1039       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
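        // For example, an s70 type widens to s128 (the next power of 2), while
        // an s300 type would give s512 but alignTo<64>(301) == 320 is smaller,
        // so s320 is used instead.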
1042         const LLT &Ty = Query.Types[BigTyIdx];
1043         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1044         if (NewSizeInBits >= 256) {
1045           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1046           if (RoundedTo < NewSizeInBits)
1047             NewSizeInBits = RoundedTo;
1048         }
1049         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1050       })
1051       .legalIf([=](const LegalityQuery &Query) {
1052           const LLT &BigTy = Query.Types[BigTyIdx];
1053           const LLT &LitTy = Query.Types[LitTyIdx];
1054 
1055           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1056             return false;
1057           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1058             return false;
1059 
1060           return BigTy.getSizeInBits() % 16 == 0 &&
1061                  LitTy.getSizeInBits() % 16 == 0 &&
1062                  BigTy.getSizeInBits() <= 1024;
1063         })
1064       // Any vectors left are the wrong size. Scalarize them.
1065       .scalarize(0)
1066       .scalarize(1);
1067   }
1068 
1069   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1070 
1071   computeTables();
1072   verify(*ST.getInstrInfo());
1073 }
1074 
1075 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1076                                          MachineRegisterInfo &MRI,
1077                                          MachineIRBuilder &B,
1078                                          GISelChangeObserver &Observer) const {
1079   switch (MI.getOpcode()) {
1080   case TargetOpcode::G_ADDRSPACE_CAST:
1081     return legalizeAddrSpaceCast(MI, MRI, B);
1082   case TargetOpcode::G_FRINT:
1083     return legalizeFrint(MI, MRI, B);
1084   case TargetOpcode::G_FCEIL:
1085     return legalizeFceil(MI, MRI, B);
1086   case TargetOpcode::G_INTRINSIC_TRUNC:
1087     return legalizeIntrinsicTrunc(MI, MRI, B);
1088   case TargetOpcode::G_SITOFP:
1089     return legalizeITOFP(MI, MRI, B, true);
1090   case TargetOpcode::G_UITOFP:
1091     return legalizeITOFP(MI, MRI, B, false);
1092   case TargetOpcode::G_FMINNUM:
1093   case TargetOpcode::G_FMAXNUM:
1094   case TargetOpcode::G_FMINNUM_IEEE:
1095   case TargetOpcode::G_FMAXNUM_IEEE:
1096     return legalizeMinNumMaxNum(MI, MRI, B);
1097   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1098     return legalizeExtractVectorElt(MI, MRI, B);
1099   case TargetOpcode::G_INSERT_VECTOR_ELT:
1100     return legalizeInsertVectorElt(MI, MRI, B);
1101   case TargetOpcode::G_FSIN:
1102   case TargetOpcode::G_FCOS:
1103     return legalizeSinCos(MI, MRI, B);
1104   case TargetOpcode::G_GLOBAL_VALUE:
1105     return legalizeGlobalValue(MI, MRI, B);
1106   case TargetOpcode::G_LOAD:
1107     return legalizeLoad(MI, MRI, B, Observer);
1108   case TargetOpcode::G_FMAD:
1109     return legalizeFMad(MI, MRI, B);
1110   default:
1111     return false;
1112   }
1113 
1114   llvm_unreachable("expected switch to return");
1115 }
1116 
1117 Register AMDGPULegalizerInfo::getSegmentAperture(
1118   unsigned AS,
1119   MachineRegisterInfo &MRI,
1120   MachineIRBuilder &B) const {
1121   MachineFunction &MF = B.getMF();
1122   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1123   const LLT S32 = LLT::scalar(32);
1124 
1125   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1126 
1127   if (ST.hasApertureRegs()) {
1128     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1129     // getreg.
1130     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1131         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1132         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1133     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1134         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1135         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
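    // Pack the hwreg fields (register id, offset, width - 1) into the
    // immediate operand expected by S_GETREG_B32.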
1136     unsigned Encoding =
1137         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1138         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1139         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1140 
1141     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1142     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1143 
1144     B.buildInstr(AMDGPU::S_GETREG_B32)
1145       .addDef(GetReg)
1146       .addImm(Encoding);
1147     MRI.setType(GetReg, S32);
1148 
1149     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1150     B.buildInstr(TargetOpcode::G_SHL)
1151       .addDef(ApertureReg)
1152       .addUse(GetReg)
1153       .addUse(ShiftAmt.getReg(0));
1154 
1155     return ApertureReg;
1156   }
1157 
1158   Register QueuePtr = MRI.createGenericVirtualRegister(
1159     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1160 
1161   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1162   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1163     return Register();
1164 
1165   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1166   // private_segment_aperture_base_hi.
1167   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1168 
1169   // FIXME: Don't use undef
1170   Value *V = UndefValue::get(PointerType::get(
1171                                Type::getInt8Ty(MF.getFunction().getContext()),
1172                                AMDGPUAS::CONSTANT_ADDRESS));
1173 
1174   MachinePointerInfo PtrInfo(V, StructOffset);
1175   MachineMemOperand *MMO = MF.getMachineMemOperand(
1176     PtrInfo,
1177     MachineMemOperand::MOLoad |
1178     MachineMemOperand::MODereferenceable |
1179     MachineMemOperand::MOInvariant,
1180     4,
1181     MinAlign(64, StructOffset));
1182 
1183   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1184   Register LoadAddr;
1185 
1186   B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1187   B.buildLoad(LoadResult, LoadAddr, *MMO);
1188   return LoadResult;
1189 }
1190 
1191 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1192   MachineInstr &MI, MachineRegisterInfo &MRI,
1193   MachineIRBuilder &B) const {
1194   MachineFunction &MF = B.getMF();
1195 
1196   B.setInstr(MI);
1197 
1198   const LLT S32 = LLT::scalar(32);
1199   Register Dst = MI.getOperand(0).getReg();
1200   Register Src = MI.getOperand(1).getReg();
1201 
1202   LLT DstTy = MRI.getType(Dst);
1203   LLT SrcTy = MRI.getType(Src);
1204   unsigned DestAS = DstTy.getAddressSpace();
1205   unsigned SrcAS = SrcTy.getAddressSpace();
1206 
1207   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1208   // vector element.
1209   assert(!DstTy.isVector());
1210 
1211   const AMDGPUTargetMachine &TM
1212     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1213 
1214   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1215   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1216     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1217     return true;
1218   }
1219 
1220   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1221     // Truncate.
1222     B.buildExtract(Dst, Src, 0);
1223     MI.eraseFromParent();
1224     return true;
1225   }
1226 
1227   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1228     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1229     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1230 
    // FIXME: This is a bit ugly due to creating a merge of two 32-bit pointers
    // into a pointer of another address space. Merge operands are required to
    // be the same type, but creating an extra ptrtoint would be kind of
    // pointless.
1234     auto HighAddr = B.buildConstant(
1235       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1236     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1237     MI.eraseFromParent();
1238     return true;
1239   }
1240 
1241   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1242     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1243            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1244     unsigned NullVal = TM.getNullPointerValue(DestAS);
1245 
1246     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1247     auto FlatNull = B.buildConstant(SrcTy, 0);
1248 
1249     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1250 
1251     // Extract low 32-bits of the pointer.
1252     B.buildExtract(PtrLo32, Src, 0);
1253 
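    // The flat null pointer must cast to the segment null value rather than
    // simply being truncated, so compare against flat null and select.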
1254     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1255     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1256     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1257 
1258     MI.eraseFromParent();
1259     return true;
1260   }
1261 
1262   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1263     return false;
1264 
1265   if (!ST.hasFlatAddressSpace())
1266     return false;
1267 
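  // Cast from local/private to flat: merge the 32-bit segment pointer with the
  // aperture base to form the 64-bit flat pointer, and map the segment null
  // value to the flat null pointer.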
1268   auto SegmentNull =
1269       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1270   auto FlatNull =
1271       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1272 
1273   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1274   if (!ApertureReg.isValid())
1275     return false;
1276 
1277   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1278   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1279 
1280   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1281 
1282   // Coerce the type of the low half of the result so we can use merge_values.
1283   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1284   B.buildInstr(TargetOpcode::G_PTRTOINT)
1285     .addDef(SrcAsInt)
1286     .addUse(Src);
1287 
1288   // TODO: Should we allow mismatched types but matching sizes in merges to
1289   // avoid the ptrtoint?
1290   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1291   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1292 
1293   MI.eraseFromParent();
1294   return true;
1295 }
1296 
1297 bool AMDGPULegalizerInfo::legalizeFrint(
1298   MachineInstr &MI, MachineRegisterInfo &MRI,
1299   MachineIRBuilder &B) const {
1300   B.setInstr(MI);
1301 
1302   Register Src = MI.getOperand(1).getReg();
1303   LLT Ty = MRI.getType(Src);
1304   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1305 
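  // Round to the nearest integer using the 2^52 trick: adding and then
  // subtracting copysign(2^52, src) discards the fractional bits. C2 is just
  // below 2^52; if |src| exceeds it, src already has no fractional part and is
  // returned unchanged.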
1306   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1307   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1308 
1309   auto C1 = B.buildFConstant(Ty, C1Val);
1310   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1311 
1312   // TODO: Should this propagate fast-math-flags?
1313   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1314   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1315 
1316   auto C2 = B.buildFConstant(Ty, C2Val);
1317   auto Fabs = B.buildFAbs(Ty, Src);
1318 
1319   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1320   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1321   return true;
1322 }
1323 
1324 bool AMDGPULegalizerInfo::legalizeFceil(
1325   MachineInstr &MI, MachineRegisterInfo &MRI,
1326   MachineIRBuilder &B) const {
1327   B.setInstr(MI);
1328 
1329   const LLT S1 = LLT::scalar(1);
1330   const LLT S64 = LLT::scalar(64);
1331 
1332   Register Src = MI.getOperand(1).getReg();
1333   assert(MRI.getType(Src) == S64);
1334 
1335   // result = trunc(src)
1336   // if (src > 0.0 && src != result)
1337   //   result += 1.0
1338 
1339   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1340 
1341   const auto Zero = B.buildFConstant(S64, 0.0);
1342   const auto One = B.buildFConstant(S64, 1.0);
1343   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1344   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1345   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1346   auto Add = B.buildSelect(S64, And, One, Zero);
1347 
1348   // TODO: Should this propagate fast-math-flags?
1349   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1350   return true;
1351 }
1352 
1353 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1354                                               MachineIRBuilder &B) {
1355   const unsigned FractBits = 52;
1356   const unsigned ExpBits = 11;
1357   LLT S32 = LLT::scalar(32);
1358 
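  // The exponent occupies bits [62:52] of the f64, i.e. bits [30:20] of the
  // high half; extract it with ubfe and subtract the bias (1023).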
1359   auto Const0 = B.buildConstant(S32, FractBits - 32);
1360   auto Const1 = B.buildConstant(S32, ExpBits);
1361 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1365 
1366   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1367 }
1368 
1369 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1370   MachineInstr &MI, MachineRegisterInfo &MRI,
1371   MachineIRBuilder &B) const {
1372   B.setInstr(MI);
1373 
1374   const LLT S1 = LLT::scalar(1);
1375   const LLT S32 = LLT::scalar(32);
1376   const LLT S64 = LLT::scalar(64);
1377 
1378   Register Src = MI.getOperand(1).getReg();
1379   assert(MRI.getType(Src) == S64);
1380 
1381   // TODO: Should this use extract since the low half is unused?
1382   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1383   Register Hi = Unmerge.getReg(1);
1384 
1385   // Extract the upper half, since this is where we will find the sign and
1386   // exponent.
1387   auto Exp = extractF64Exponent(Hi, B);
1388 
1389   const unsigned FractBits = 52;
1390 
1391   // Extract the sign bit.
1392   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1393   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1394 
1395   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1396 
1397   const auto Zero32 = B.buildConstant(S32, 0);
1398 
1399   // Extend back to 64-bits.
1400   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1401 
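  // Build a mask of the fractional bits that must be cleared for this exponent
  // and clear them. An exponent < 0 means the result is +/-0 (keep only the
  // sign), and an exponent > 51 means the value already has no fraction.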
1402   auto Shr = B.buildAShr(S64, FractMask, Exp);
1403   auto Not = B.buildNot(S64, Shr);
1404   auto Tmp0 = B.buildAnd(S64, Src, Not);
1405   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1406 
1407   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1408   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1409 
1410   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1411   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1412   return true;
1413 }
1414 
1415 bool AMDGPULegalizerInfo::legalizeITOFP(
1416   MachineInstr &MI, MachineRegisterInfo &MRI,
1417   MachineIRBuilder &B, bool Signed) const {
1418   B.setInstr(MI);
1419 
1420   Register Dst = MI.getOperand(0).getReg();
1421   Register Src = MI.getOperand(1).getReg();
1422 
1423   const LLT S64 = LLT::scalar(64);
1424   const LLT S32 = LLT::scalar(32);
1425 
1426   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1427 
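  // Convert the two 32-bit halves separately: scale the converted high half by
  // 2^32 with ldexp and add the (always unsigned) converted low half.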
1428   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1429 
1430   auto CvtHi = Signed ?
1431     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1432     B.buildUITOFP(S64, Unmerge.getReg(1));
1433 
1434   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1435 
1436   auto ThirtyTwo = B.buildConstant(S32, 32);
1437   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1438     .addUse(CvtHi.getReg(0))
1439     .addUse(ThirtyTwo.getReg(0));
1440 
1441   // TODO: Should this propagate fast-math-flags?
1442   B.buildFAdd(Dst, LdExp, CvtLo);
1443   MI.eraseFromParent();
1444   return true;
1445 }
1446 
1447 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1448   MachineInstr &MI, MachineRegisterInfo &MRI,
1449   MachineIRBuilder &B) const {
1450   MachineFunction &MF = B.getMF();
1451   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1452 
1453   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1454                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1455 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1458   if (!MFI->getMode().IEEE)
1459     return !IsIEEEOp;
1460 
1461   if (IsIEEEOp)
1462     return true;
1463 
1464   MachineIRBuilder HelperBuilder(MI);
1465   GISelObserverWrapper DummyObserver;
1466   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1467   HelperBuilder.setInstr(MI);
1468   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1469 }
1470 
1471 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1472   MachineInstr &MI, MachineRegisterInfo &MRI,
1473   MachineIRBuilder &B) const {
1474   // TODO: Should move some of this into LegalizerHelper.
1475 
1476   // TODO: Promote dynamic indexing of s16 to s32
1477   // TODO: Dynamic s64 indexing is only legal for SGPR.
1478   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1479   if (!IdxVal) // Dynamic case will be selected to register indexing.
1480     return true;
1481 
1482   Register Dst = MI.getOperand(0).getReg();
1483   Register Vec = MI.getOperand(1).getReg();
1484 
1485   LLT VecTy = MRI.getType(Vec);
1486   LLT EltTy = VecTy.getElementType();
1487   assert(EltTy == MRI.getType(Dst));
1488 
1489   B.setInstr(MI);
1490 
1491   if (IdxVal.getValue() < VecTy.getNumElements())
1492     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1493   else
1494     B.buildUndef(Dst);
1495 
1496   MI.eraseFromParent();
1497   return true;
1498 }
1499 
1500 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1501   MachineInstr &MI, MachineRegisterInfo &MRI,
1502   MachineIRBuilder &B) const {
1503   // TODO: Should move some of this into LegalizerHelper.
1504 
1505   // TODO: Promote dynamic indexing of s16 to s32
1506   // TODO: Dynamic s64 indexing is only legal for SGPR.
1507   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1508   if (!IdxVal) // Dynamic case will be selected to register indexing.
1509     return true;
1510 
1511   Register Dst = MI.getOperand(0).getReg();
1512   Register Vec = MI.getOperand(1).getReg();
1513   Register Ins = MI.getOperand(2).getReg();
1514 
1515   LLT VecTy = MRI.getType(Vec);
1516   LLT EltTy = VecTy.getElementType();
1517   assert(EltTy == MRI.getType(Ins));
1518 
1519   B.setInstr(MI);
1520 
1521   if (IdxVal.getValue() < VecTy.getNumElements())
1522     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1523   else
1524     B.buildUndef(Dst);
1525 
1526   MI.eraseFromParent();
1527   return true;
1528 }
1529 
1530 bool AMDGPULegalizerInfo::legalizeSinCos(
1531   MachineInstr &MI, MachineRegisterInfo &MRI,
1532   MachineIRBuilder &B) const {
1533   B.setInstr(MI);
1534 
1535   Register DstReg = MI.getOperand(0).getReg();
1536   Register SrcReg = MI.getOperand(1).getReg();
1537   LLT Ty = MRI.getType(DstReg);
1538   unsigned Flags = MI.getFlags();
1539 
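  // The amdgcn sin/cos intrinsics take their input in units of full periods
  // rather than radians, so fold in a multiply by 1/(2*pi) here. Subtargets
  // with a reduced trig input range additionally need the operand wrapped
  // into [0, 1) with the fract intrinsic first.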
1540   Register TrigVal;
1541   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1542   if (ST.hasTrigReducedRange()) {
1543     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1544     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1545       .addUse(MulVal.getReg(0))
1546       .setMIFlags(Flags).getReg(0);
1547   } else
1548     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1549 
1550   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1551     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1552   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1553     .addUse(TrigVal)
1554     .setMIFlags(Flags);
1555   MI.eraseFromParent();
1556   return true;
1557 }
1558 
1559 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1560   Register DstReg, LLT PtrTy,
1561   MachineIRBuilder &B, const GlobalValue *GV,
1562   unsigned Offset, unsigned GAFlags) const {
1563   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1564   // to the following code sequence:
1565   //
1566   // For constant address space:
1567   //   s_getpc_b64 s[0:1]
1568   //   s_add_u32 s0, s0, $symbol
1569   //   s_addc_u32 s1, s1, 0
1570   //
1571   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1572   //   a fixup or relocation is emitted to replace $symbol with a literal
1573   //   constant, which is a pc-relative offset from the encoding of the $symbol
1574   //   operand to the global variable.
1575   //
1576   // For global address space:
1577   //   s_getpc_b64 s[0:1]
1578   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1579   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1580   //
1581   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1582   //   fixups or relocations are emitted to replace $symbol@*@lo and
1583   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1584   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1585   //   operand to the global variable.
1586   //
1587   // What we want here is an offset from the value returned by s_getpc
1588   // (which is the address of the s_add_u32 instruction) to the global
1589   // variable, but since the encoding of $symbol starts 4 bytes after the start
1590   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1591   // small. This requires us to add 4 to the global variable offset in order to
1592   // compute the correct address.
1593 
1594   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1595 
1596   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1597     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1598 
1599   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1600     .addDef(PCReg);
1601 
1602   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1603   if (GAFlags == SIInstrInfo::MO_NONE)
1604     MIB.addImm(0);
1605   else
1606     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1607 
1608   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1609 
1610   if (PtrTy.getSizeInBits() == 32)
1611     B.buildExtract(DstReg, PCReg, 0);
1612   return true;
}
1614 
1615 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1616   MachineInstr &MI, MachineRegisterInfo &MRI,
1617   MachineIRBuilder &B) const {
1618   Register DstReg = MI.getOperand(0).getReg();
1619   LLT Ty = MRI.getType(DstReg);
1620   unsigned AS = Ty.getAddressSpace();
1621 
1622   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1623   MachineFunction &MF = B.getMF();
1624   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1625   B.setInstr(MI);
1626 
1627   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1628     if (!MFI->isEntryFunction()) {
1629       const Function &Fn = MF.getFunction();
1630       DiagnosticInfoUnsupported BadLDSDecl(
1631         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1632       Fn.getContext().diagnose(BadLDSDecl);
1633     }
1634 
1635     // TODO: We could emit code to handle the initialization somewhere.
1636     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1637       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1638       MI.eraseFromParent();
1639       return true;
1640     }
1641 
1642     const Function &Fn = MF.getFunction();
1643     DiagnosticInfoUnsupported BadInit(
1644       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1645     Fn.getContext().diagnose(BadInit);
1646     return true;
1647   }
1648 
1649   const SITargetLowering *TLI = ST.getTargetLowering();
1650 
1651   if (TLI->shouldEmitFixup(GV)) {
1652     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1653     MI.eraseFromParent();
1654     return true;
1655   }
1656 
1657   if (TLI->shouldEmitPCReloc(GV)) {
1658     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1659     MI.eraseFromParent();
1660     return true;
1661   }
1662 
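  // Otherwise the global cannot be addressed directly. Build a pc-relative
  // pointer to its GOT entry and load the 64-bit address of the global from
  // there.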
1663   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1664   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1665 
1666   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1667     MachinePointerInfo::getGOT(MF),
1668     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1669     MachineMemOperand::MOInvariant,
1670     8 /*Size*/, 8 /*Align*/);
1671 
1672   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1673 
1674   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1676     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1677     B.buildExtract(DstReg, Load, 0);
1678   } else
1679     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1680 
1681   MI.eraseFromParent();
1682   return true;
1683 }
1684 
1685 bool AMDGPULegalizerInfo::legalizeLoad(
1686   MachineInstr &MI, MachineRegisterInfo &MRI,
1687   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1688   B.setInstr(MI);
1689   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1690   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1691   Observer.changingInstr(MI);
1692   MI.getOperand(1).setReg(Cast.getReg(0));
1693   Observer.changedInstr(MI);
1694   return true;
1695 }
1696 
1697 bool AMDGPULegalizerInfo::legalizeFMad(
1698   MachineInstr &MI, MachineRegisterInfo &MRI,
1699   MachineIRBuilder &B) const {
1700   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1701   assert(Ty.isScalar());
1702 
1703   // TODO: Always legal with future ftz flag.
1704   if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1705     return true;
1706   if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1707     return true;
1708 
1709   MachineFunction &MF = B.getMF();
1710 
1711   MachineIRBuilder HelperBuilder(MI);
1712   GISelObserverWrapper DummyObserver;
1713   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1714   HelperBuilder.setMBB(*MI.getParent());
1715   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1716 }
1717 
// Return the use branch instruction, or null if the usage is invalid.
1719 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1720                                        MachineRegisterInfo &MRI) {
1721   Register CondDef = MI.getOperand(0).getReg();
1722   if (!MRI.hasOneNonDBGUse(CondDef))
1723     return nullptr;
1724 
1725   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1726   return UseMI.getParent() == MI.getParent() &&
1727     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1728 }
1729 
1730 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1731                                                 Register Reg, LLT Ty) const {
1732   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1733   if (LiveIn)
1734     return LiveIn;
1735 
1736   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1737   MRI.addLiveIn(Reg, NewReg);
1738   return NewReg;
1739 }
1740 
1741 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1742                                          const ArgDescriptor *Arg) const {
1743   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1744     return false; // TODO: Handle these
1745 
1746   assert(Arg->getRegister().isPhysical());
1747 
1748   MachineRegisterInfo &MRI = *B.getMRI();
1749 
1750   LLT Ty = MRI.getType(DstReg);
1751   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1752 
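  // Some arguments occupy only a bitfield of their preloaded register (for
  // example the packed workitem IDs), so shift the field down and mask it
  // out before use.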
1753   if (Arg->isMasked()) {
1754     // TODO: Should we try to emit this once in the entry block?
1755     const LLT S32 = LLT::scalar(32);
1756     const unsigned Mask = Arg->getMask();
1757     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1758 
1759     Register AndMaskSrc = LiveIn;
1760 
1761     if (Shift != 0) {
1762       auto ShiftAmt = B.buildConstant(S32, Shift);
1763       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1764     }
1765 
1766     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1767   } else
1768     B.buildCopy(DstReg, LiveIn);
1769 
  // Insert the argument copy if it doesn't already exist.
1771   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1772   if (!MRI.getVRegDef(LiveIn)) {
1773     // FIXME: Should have scoped insert pt
1774     MachineBasicBlock &OrigInsBB = B.getMBB();
1775     auto OrigInsPt = B.getInsertPt();
1776 
1777     MachineBasicBlock &EntryMBB = B.getMF().front();
1778     EntryMBB.addLiveIn(Arg->getRegister());
1779     B.setInsertPt(EntryMBB, EntryMBB.begin());
1780     B.buildCopy(LiveIn, Arg->getRegister());
1781 
1782     B.setInsertPt(OrigInsBB, OrigInsPt);
1783   }
1784 
1785   return true;
1786 }
1787 
1788 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1789   MachineInstr &MI,
1790   MachineRegisterInfo &MRI,
1791   MachineIRBuilder &B,
1792   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1793   B.setInstr(MI);
1794 
1795   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1796 
1797   const ArgDescriptor *Arg;
1798   const TargetRegisterClass *RC;
1799   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1800   if (!Arg) {
1801     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1802     return false;
1803   }
1804 
1805   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1806     MI.eraseFromParent();
1807     return true;
1808   }
1809 
1810   return false;
1811 }
1812 
1813 bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
1814                                            MachineRegisterInfo &MRI,
1815                                            MachineIRBuilder &B) const {
1816   B.setInstr(MI);
1817   Register Res = MI.getOperand(0).getReg();
1818   Register LHS = MI.getOperand(2).getReg();
1819   Register RHS = MI.getOperand(3).getReg();
1820   uint16_t Flags = MI.getFlags();
1821 
1822   LLT S32 = LLT::scalar(32);
1823   LLT S1 = LLT::scalar(1);
1824 
1825   auto Abs = B.buildFAbs(S32, RHS, Flags);
1826   const APFloat C0Val(1.0f);
1827 
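  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. When
  // |RHS| exceeds 2^96 its reciprocal can be flushed to zero as a denormal,
  // so scale RHS down by 2^-32 before taking the rcp and multiply the final
  // result by the same factor to compensate.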
1828   auto C0 = B.buildConstant(S32, 0x6f800000);
1829   auto C1 = B.buildConstant(S32, 0x2f800000);
1830   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
1831 
1832   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1833   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1834 
1835   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1836 
1837   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1838     .addUse(Mul0.getReg(0))
1839     .setMIFlags(Flags);
1840 
1841   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1842 
1843   B.buildFMul(Res, Sel, Mul1, Flags);
1844 
1845   MI.eraseFromParent();
1846   return true;
1847 }
1848 
1849 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1850                                                  MachineRegisterInfo &MRI,
1851                                                  MachineIRBuilder &B) const {
1852   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1853   if (!MFI->isEntryFunction()) {
1854     return legalizePreloadedArgIntrin(MI, MRI, B,
1855                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1856   }
1857 
1858   B.setInstr(MI);
1859 
1860   uint64_t Offset =
1861     ST.getTargetLowering()->getImplicitParameterOffset(
1862       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1863   Register DstReg = MI.getOperand(0).getReg();
1864   LLT DstTy = MRI.getType(DstReg);
1865   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1866 
1867   const ArgDescriptor *Arg;
1868   const TargetRegisterClass *RC;
1869   std::tie(Arg, RC)
1870     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1871   if (!Arg)
1872     return false;
1873 
1874   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1875   if (!loadInputValue(KernargPtrReg, B, Arg))
1876     return false;
1877 
1878   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1879   MI.eraseFromParent();
1880   return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1884                                               MachineRegisterInfo &MRI,
1885                                               MachineIRBuilder &B,
1886                                               unsigned AddrSpace) const {
1887   B.setInstr(MI);
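  // A flat pointer is in the queried segment iff the high 32 bits of its
  // address equal that segment's aperture base, so compare the pointer's top
  // half against the aperture register.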
1888   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1889   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1890   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1891   MI.eraseFromParent();
1892   return true;
1893 }
1894 
1895 /// Handle register layout difference for f16 images for some subtargets.
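/// Subtargets with unpacked D16 VMEM instructions keep each 16-bit component
/// in a separate 32-bit register, so a packed <N x s16> data operand is
/// widened to <N x s32> with one component per register.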
1896 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1897                                              MachineRegisterInfo &MRI,
1898                                              Register Reg) const {
1899   if (!ST.hasUnpackedD16VMem())
1900     return Reg;
1901 
1902   const LLT S16 = LLT::scalar(16);
1903   const LLT S32 = LLT::scalar(32);
1904   LLT StoreVT = MRI.getType(Reg);
1905   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1906 
1907   auto Unmerge = B.buildUnmerge(S16, Reg);
1908 
1909   SmallVector<Register, 4> WideRegs;
1910   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1911     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1912 
1913   int NumElts = StoreVT.getNumElements();
1914 
1915   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1916 }
1917 
1918 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1919                                                  MachineRegisterInfo &MRI,
1920                                                  MachineIRBuilder &B,
1921                                                  bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
1923   Register VData = MI.getOperand(1).getReg();
1924   LLT Ty = MRI.getType(VData);
1925 
1926   B.setInstr(MI);
1927 
1928   const LLT S32 = LLT::scalar(32);
1929   const LLT S16 = LLT::scalar(16);
1930 
  // Fix up illegal register types for i8 and i16 stores.
1932   if (Ty == LLT::scalar(8) || Ty == S16) {
1933     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
1934     MI.getOperand(1).setReg(AnyExt);
1935     return true;
1936   }
1937 
1938   if (Ty.isVector()) {
1939     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
1940       if (IsFormat)
1941         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
1942       return true;
1943     }
1944 
1945     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
1946   }
1947 
1948   return Ty == S32;
1949 }
1950 
1951 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1952                                             MachineRegisterInfo &MRI,
1953                                             MachineIRBuilder &B) const {
  // For the control flow intrinsics, replace the G_BRCOND use with the
  // exec-manipulating branch pseudos.
1955   switch (MI.getIntrinsicID()) {
1956   case Intrinsic::amdgcn_if: {
1957     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1958       const SIRegisterInfo *TRI
1959         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1960 
1961       B.setInstr(*BrCond);
1962       Register Def = MI.getOperand(1).getReg();
1963       Register Use = MI.getOperand(3).getReg();
1964       B.buildInstr(AMDGPU::SI_IF)
1965         .addDef(Def)
1966         .addUse(Use)
1967         .addMBB(BrCond->getOperand(1).getMBB());
1968 
1969       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1970       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1971       MI.eraseFromParent();
1972       BrCond->eraseFromParent();
1973       return true;
1974     }
1975 
1976     return false;
1977   }
1978   case Intrinsic::amdgcn_loop: {
1979     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1980       const SIRegisterInfo *TRI
1981         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1982 
1983       B.setInstr(*BrCond);
1984       Register Reg = MI.getOperand(2).getReg();
1985       B.buildInstr(AMDGPU::SI_LOOP)
1986         .addUse(Reg)
1987         .addMBB(BrCond->getOperand(1).getMBB());
1988       MI.eraseFromParent();
1989       BrCond->eraseFromParent();
1990       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1991       return true;
1992     }
1993 
1994     return false;
1995   }
1996   case Intrinsic::amdgcn_kernarg_segment_ptr:
1997     return legalizePreloadedArgIntrin(
1998       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1999   case Intrinsic::amdgcn_implicitarg_ptr:
2000     return legalizeImplicitArgPtr(MI, MRI, B);
2001   case Intrinsic::amdgcn_workitem_id_x:
2002     return legalizePreloadedArgIntrin(MI, MRI, B,
2003                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2004   case Intrinsic::amdgcn_workitem_id_y:
2005     return legalizePreloadedArgIntrin(MI, MRI, B,
2006                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2007   case Intrinsic::amdgcn_workitem_id_z:
2008     return legalizePreloadedArgIntrin(MI, MRI, B,
2009                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2010   case Intrinsic::amdgcn_workgroup_id_x:
2011     return legalizePreloadedArgIntrin(MI, MRI, B,
2012                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2013   case Intrinsic::amdgcn_workgroup_id_y:
2014     return legalizePreloadedArgIntrin(MI, MRI, B,
2015                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2016   case Intrinsic::amdgcn_workgroup_id_z:
2017     return legalizePreloadedArgIntrin(MI, MRI, B,
2018                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2019   case Intrinsic::amdgcn_dispatch_ptr:
2020     return legalizePreloadedArgIntrin(MI, MRI, B,
2021                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2022   case Intrinsic::amdgcn_queue_ptr:
2023     return legalizePreloadedArgIntrin(MI, MRI, B,
2024                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2025   case Intrinsic::amdgcn_implicit_buffer_ptr:
2026     return legalizePreloadedArgIntrin(
2027       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2028   case Intrinsic::amdgcn_dispatch_id:
2029     return legalizePreloadedArgIntrin(MI, MRI, B,
2030                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2031   case Intrinsic::amdgcn_fdiv_fast:
2032     return legalizeFDIVFast(MI, MRI, B);
2033   case Intrinsic::amdgcn_is_shared:
2034     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2035   case Intrinsic::amdgcn_is_private:
2036     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2037   case Intrinsic::amdgcn_wavefrontsize: {
2038     B.setInstr(MI);
2039     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2040     MI.eraseFromParent();
2041     return true;
2042   }
2043   case Intrinsic::amdgcn_raw_buffer_store:
2044     return legalizeRawBufferStore(MI, MRI, B, false);
2045   case Intrinsic::amdgcn_raw_buffer_store_format:
2046     return legalizeRawBufferStore(MI, MRI, B, true);
2047   default:
2048     return true;
2049   }
2050 
2051   return true;
2052 }
2053