1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
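// Match vectors of sub-dword elements with an odd element count whose total
// size does not fill a whole number of 32-bit registers (e.g. v3s16). Such
// vectors are padded with one extra element below rather than split.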
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
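// Match vectors of 16-bit elements with more than two elements, i.e. wider
// than a single packed v2s16.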
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
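// Mutation that pads the vector type with one additional element of the same
// element type.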
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
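// Mutation that breaks a vector into pieces of roughly 64 bits each, keeping
// the element type (falling back to a scalar if only one element remains).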
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements until the total size reaches the
// next multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
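// Match scalar truncating stores wider than 32 bits, i.e. where the value type
// is larger than the number of bits actually written to memory.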
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
  : ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal);
248 
249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250   // elements for v3s16
251   getActionDefinitionsBuilder(G_PHI)
252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253     .legalFor(AllS32Vectors)
254     .legalFor(AllS64Vectors)
255     .legalFor(AddrSpaces64)
256     .legalFor(AddrSpaces32)
257     .clampScalar(0, S32, S256)
258     .widenScalarToNextPow2(0, 32)
259     .clampMaxNumElements(0, S32, 16)
260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261     .legalIf(isPointer(0));
262 
263   if (ST.has16BitInsts()) {
264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265       .legalFor({S32, S16})
266       .clampScalar(0, S16, S32)
267       .scalarize(0);
268   } else {
269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270       .legalFor({S32})
271       .clampScalar(0, S32, S32)
272       .scalarize(0);
273   }
274 
275   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276     .legalFor({S32})
277     .clampScalar(0, S32, S32)
278     .scalarize(0);
279 
280   // Report legal for any types we can handle anywhere. For the cases only legal
281   // on the SALU, RegBankSelect will be able to re-legalize.
282   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284     .clampScalar(0, S32, S64)
285     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287     .widenScalarToNextPow2(0)
288     .scalarize(0);
289 
290   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292     .legalFor({{S32, S1}})
293     .clampScalar(0, S32, S32)
294     .scalarize(0); // TODO: Implement.
295 
296   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297     .lower();
298 
299   getActionDefinitionsBuilder(G_BITCAST)
300     // Don't worry about the size constraint.
301     .legalIf(all(isRegisterType(0), isRegisterType(1)))
302     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});
304 
305   getActionDefinitionsBuilder(G_FCONSTANT)
306     .legalFor({S32, S64, S16})
307     .clampScalar(0, S16, S64);
308 
309   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .clampScalarOrElt(0, S32, S1024)
314     .legalIf(isMultiple32(0))
315     .widenScalarToNextPow2(0, 32)
316     .clampMaxNumElements(0, S32, 16);
317 
318 
319   // FIXME: i1 operands to intrinsics should always be legal, but other i1
320   // values may not be legal.  We need to figure out how to distinguish
321   // between these two scenarios.
322   getActionDefinitionsBuilder(G_CONSTANT)
323     .legalFor({S1, S32, S64, S16, GlobalPtr,
324                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325     .clampScalar(0, S32, S64)
326     .widenScalarToNextPow2(0)
327     .legalIf(isPointer(0));
328 
329   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332 
333 
334   auto &FPOpActions = getActionDefinitionsBuilder(
335     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336     .legalFor({S32, S64});
337   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338     .customFor({S32, S64});
339   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
340     .customFor({S32, S64});
341 
342   if (ST.has16BitInsts()) {
343     if (ST.hasVOP3PInsts())
344       FPOpActions.legalFor({S16, V2S16});
345     else
346       FPOpActions.legalFor({S16});
347 
348     TrigActions.customFor({S16});
349     FDIVActions.customFor({S16});
350   }
351 
352   auto &MinNumMaxNum = getActionDefinitionsBuilder({
353       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
354 
355   if (ST.hasVOP3PInsts()) {
356     MinNumMaxNum.customFor(FPTypesPK16)
357       .clampMaxNumElements(0, S16, 2)
358       .clampScalar(0, S16, S64)
359       .scalarize(0);
360   } else if (ST.has16BitInsts()) {
361     MinNumMaxNum.customFor(FPTypes16)
362       .clampScalar(0, S16, S64)
363       .scalarize(0);
364   } else {
365     MinNumMaxNum.customFor(FPTypesBase)
366       .clampScalar(0, S32, S64)
367       .scalarize(0);
368   }
369 
370   if (ST.hasVOP3PInsts())
371     FPOpActions.clampMaxNumElements(0, S16, 2);
372 
373   FPOpActions
374     .scalarize(0)
375     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
376 
377   TrigActions
378     .scalarize(0)
379     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
380 
381   FDIVActions
382     .scalarize(0)
383     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
384 
385   getActionDefinitionsBuilder({G_FNEG, G_FABS})
386     .legalFor(FPTypesPK16)
387     .clampMaxNumElements(0, S16, 2)
388     .scalarize(0)
389     .clampScalar(0, S16, S64);
390 
391   // TODO: Implement
392   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
393 
394   if (ST.has16BitInsts()) {
395     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
396       .legalFor({S32, S64, S16})
397       .scalarize(0)
398       .clampScalar(0, S16, S64);
399   } else {
400     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
401       .legalFor({S32, S64})
402       .scalarize(0)
403       .clampScalar(0, S32, S64);
404   }
405 
406   getActionDefinitionsBuilder(G_FPTRUNC)
407     .legalFor({{S32, S64}, {S16, S32}})
408     .scalarize(0);
409 
410   getActionDefinitionsBuilder(G_FPEXT)
411     .legalFor({{S64, S32}, {S32, S16}})
412     .lowerFor({{S64, S16}}) // FIXME: Implement
413     .scalarize(0);
414 
415   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
416   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
417 
418   getActionDefinitionsBuilder(G_FSUB)
419       // Use actual fsub instruction
420       .legalFor({S32})
421       // Must use fadd + fneg
422       .lowerFor({S64, S16, V2S16})
423       .scalarize(0)
424       .clampScalar(0, S32, S64);
425 
426   // Whether this is legal depends on the floating point mode for the function.
427   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
428   if (ST.hasMadF16())
429     FMad.customFor({S32, S16});
430   else
431     FMad.customFor({S32});
432   FMad.scalarize(0)
433       .lower();
434 
435   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
436     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
437                {S32, S1}, {S64, S1}, {S16, S1},
438                {S96, S32},
439                // FIXME: Hack
440                {S64, LLT::scalar(33)},
441                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
442     .scalarize(0);
443 
444   // TODO: Split s1->s64 during regbankselect for VALU.
445   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
446     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
447     .lowerFor({{S32, S64}})
448     .lowerIf(typeIs(1, S1))
449     .customFor({{S64, S64}});
450   if (ST.has16BitInsts())
451     IToFP.legalFor({{S16, S16}});
452   IToFP.clampScalar(1, S32, S64)
453        .scalarize(0);
454 
455   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
456     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
457   if (ST.has16BitInsts())
458     FPToI.legalFor({{S16, S16}});
459   else
460     FPToI.minScalar(1, S32);
461 
462   FPToI.minScalar(0, S32)
463        .scalarize(0);
464 
465   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
466     .legalFor({S32, S64})
467     .scalarize(0);
468 
469   if (ST.has16BitInsts()) {
470     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
471       .legalFor({S16, S32, S64})
472       .clampScalar(0, S16, S64)
473       .scalarize(0);
474   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
475     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
476       .legalFor({S32, S64})
477       .clampScalar(0, S32, S64)
478       .scalarize(0);
479   } else {
480     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
481       .legalFor({S32})
482       .customFor({S64})
483       .clampScalar(0, S32, S64)
484       .scalarize(0);
485   }
486 
487   getActionDefinitionsBuilder(G_PTR_ADD)
488     .legalForCartesianProduct(AddrSpaces64, {S64})
489     .legalForCartesianProduct(AddrSpaces32, {S32})
490     .scalarize(0);
491 
492   getActionDefinitionsBuilder(G_PTR_MASK)
493     .scalarize(0)
494     .alwaysLegal();
495 
496   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
497 
498   auto &CmpBuilder =
499     getActionDefinitionsBuilder(G_ICMP)
500     .legalForCartesianProduct(
501       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
502     .legalFor({{S1, S32}, {S1, S64}});
503   if (ST.has16BitInsts()) {
504     CmpBuilder.legalFor({{S1, S16}});
505   }
506 
507   CmpBuilder
508     .widenScalarToNextPow2(1)
509     .clampScalar(1, S32, S64)
510     .scalarize(0)
511     .legalIf(all(typeIs(0, S1), isPointer(1)));
512 
513   getActionDefinitionsBuilder(G_FCMP)
514     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
515     .widenScalarToNextPow2(1)
516     .clampScalar(1, S32, S64)
517     .scalarize(0);
518 
519   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
520   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
521                                G_FLOG, G_FLOG2, G_FLOG10})
522     .legalFor({S32})
523     .scalarize(0);
524 
525   // The 64-bit versions produce 32-bit results, but only on the SALU.
526   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
527                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
528                                G_CTPOP})
529     .legalFor({{S32, S32}, {S32, S64}})
530     .clampScalar(0, S32, S32)
531     .clampScalar(1, S32, S64)
532     .scalarize(0)
533     .widenScalarToNextPow2(0, 32)
534     .widenScalarToNextPow2(1, 32);
535 
536   // TODO: Expand for > s32
537   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
538     .legalFor({S32})
539     .clampScalar(0, S32, S32)
540     .scalarize(0);
541 
542   if (ST.has16BitInsts()) {
543     if (ST.hasVOP3PInsts()) {
544       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
545         .legalFor({S32, S16, V2S16})
546         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
547         .clampMaxNumElements(0, S16, 2)
548         .clampScalar(0, S16, S32)
549         .widenScalarToNextPow2(0)
550         .scalarize(0);
551     } else {
552       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
553         .legalFor({S32, S16})
554         .widenScalarToNextPow2(0)
555         .clampScalar(0, S16, S32)
556         .scalarize(0);
557     }
558   } else {
559     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
560       .legalFor({S32})
561       .clampScalar(0, S32, S32)
562       .widenScalarToNextPow2(0)
563       .scalarize(0);
564   }
565 
566   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
567     return [=](const LegalityQuery &Query) {
568       return Query.Types[TypeIdx0].getSizeInBits() <
569              Query.Types[TypeIdx1].getSizeInBits();
570     };
571   };
572 
573   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
574     return [=](const LegalityQuery &Query) {
575       return Query.Types[TypeIdx0].getSizeInBits() >
576              Query.Types[TypeIdx1].getSizeInBits();
577     };
578   };
579 
580   getActionDefinitionsBuilder(G_INTTOPTR)
581     // List the common cases
582     .legalForCartesianProduct(AddrSpaces64, {S64})
583     .legalForCartesianProduct(AddrSpaces32, {S32})
584     .scalarize(0)
585     // Accept any address space as long as the size matches
586     .legalIf(sameSize(0, 1))
587     .widenScalarIf(smallerThan(1, 0),
588       [](const LegalityQuery &Query) {
589         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
590       })
591     .narrowScalarIf(greaterThan(1, 0),
592       [](const LegalityQuery &Query) {
593         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
594       });
595 
596   getActionDefinitionsBuilder(G_PTRTOINT)
597     // List the common cases
598     .legalForCartesianProduct(AddrSpaces64, {S64})
599     .legalForCartesianProduct(AddrSpaces32, {S32})
600     .scalarize(0)
601     // Accept any address space as long as the size matches
602     .legalIf(sameSize(0, 1))
603     .widenScalarIf(smallerThan(0, 1),
604       [](const LegalityQuery &Query) {
605         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
606       })
607     .narrowScalarIf(
608       greaterThan(0, 1),
609       [](const LegalityQuery &Query) {
610         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
611       });
612 
613   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
614     .scalarize(0)
615     .custom();
616 
617   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
618   // handle some operations by just promoting the register during
619   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
620   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
621     switch (AS) {
622     // FIXME: Private element size.
623     case AMDGPUAS::PRIVATE_ADDRESS:
624       return 32;
625     // FIXME: Check subtarget
626     case AMDGPUAS::LOCAL_ADDRESS:
627       return ST.useDS128() ? 128 : 64;
628 
629     // Treat constant and global as identical. SMRD loads are sometimes usable
630     // for global loads (ideally constant address space should be eliminated)
631     // depending on the context. Legality cannot be context dependent, but
632     // RegBankSelect can split the load as necessary depending on the pointer
633     // register bank/uniformity and if the memory is invariant or not written in
634     // a kernel.
635     case AMDGPUAS::CONSTANT_ADDRESS:
636     case AMDGPUAS::GLOBAL_ADDRESS:
637       return 512;
638     default:
639       return 128;
640     }
641   };
642 
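  // Return true if a load/store of this type and memory size has to be broken
  // up: vector extloads, accesses wider than the address space allows, 96-bit
  // accesses without dwordx3 support, and misaligned accesses the target
  // cannot handle directly.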
643   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
644     const LLT DstTy = Query.Types[0];
645 
646     // Split vector extloads.
647     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
648     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
649       return true;
650 
651     const LLT PtrTy = Query.Types[1];
652     unsigned AS = PtrTy.getAddressSpace();
653     if (MemSize > maxSizeForAddrSpace(AS))
654       return true;
655 
656     // Catch weird sized loads that don't evenly divide into the access sizes
657     // TODO: May be able to widen depending on alignment etc.
658     unsigned NumRegs = MemSize / 32;
659     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
660       return true;
661 
662     unsigned Align = Query.MMODescrs[0].AlignInBits;
663     if (Align < MemSize) {
664       const SITargetLowering *TLI = ST.getTargetLowering();
665       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
666     }
667 
668     return false;
669   };
670 
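  // Minimum alignments, in bits, for the memory descriptors below. A value of
  // 0 places no alignment restriction, which is the case when the subtarget
  // supports unaligned buffer access.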
671   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
672   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
673   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
674 
675   // TODO: Refine based on subtargets which support unaligned access or 128-bit
676   // LDS
677   // TODO: Unsupported flat for SI.
678 
679   for (unsigned Op : {G_LOAD, G_STORE}) {
680     const bool IsStore = Op == G_STORE;
681 
682     auto &Actions = getActionDefinitionsBuilder(Op);
683     // Whitelist the common cases.
684     // TODO: Pointer loads
685     // TODO: Wide constant loads
686     // TODO: Only CI+ has 3x loads
687     // TODO: Loads to s16 on gfx9
688     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
689                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
690                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
691                                       {S96, GlobalPtr, 96, GlobalAlign32},
692                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
693                                       {S128, GlobalPtr, 128, GlobalAlign32},
694                                       {S64, GlobalPtr, 64, GlobalAlign32},
695                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
696                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
697                                       {S32, GlobalPtr, 8, GlobalAlign8},
698                                       {S32, GlobalPtr, 16, GlobalAlign16},
699 
700                                       {S32, LocalPtr, 32, 32},
701                                       {S64, LocalPtr, 64, 32},
702                                       {V2S32, LocalPtr, 64, 32},
703                                       {S32, LocalPtr, 8, 8},
704                                       {S32, LocalPtr, 16, 16},
705                                       {V2S16, LocalPtr, 32, 32},
706 
707                                       {S32, PrivatePtr, 32, 32},
708                                       {S32, PrivatePtr, 8, 8},
709                                       {S32, PrivatePtr, 16, 16},
710                                       {V2S16, PrivatePtr, 32, 32},
711 
712                                       {S32, FlatPtr, 32, GlobalAlign32},
713                                       {S32, FlatPtr, 16, GlobalAlign16},
714                                       {S32, FlatPtr, 8, GlobalAlign8},
715                                       {V2S16, FlatPtr, 32, GlobalAlign32},
716 
717                                       {S32, ConstantPtr, 32, GlobalAlign32},
718                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
719                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
720                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
721                                       {S64, ConstantPtr, 64, GlobalAlign32},
722                                       {S128, ConstantPtr, 128, GlobalAlign32},
723                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
724     Actions
725         .customIf(typeIs(1, Constant32Ptr))
726         .narrowScalarIf(
727             [=](const LegalityQuery &Query) -> bool {
728               return !Query.Types[0].isVector() && needToSplitLoad(Query);
729             },
730             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
731               const LLT DstTy = Query.Types[0];
732               const LLT PtrTy = Query.Types[1];
733 
734               const unsigned DstSize = DstTy.getSizeInBits();
735               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
736 
737               // Split extloads.
738               if (DstSize > MemSize)
739                 return std::make_pair(0, LLT::scalar(MemSize));
740 
741               if (DstSize > 32 && (DstSize % 32 != 0)) {
742                 // FIXME: Need a way to specify non-extload of larger size if
743                 // suitably aligned.
744                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
745               }
746 
747               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
748               if (MemSize > MaxSize)
749                 return std::make_pair(0, LLT::scalar(MaxSize));
750 
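              // Otherwise the split is due to alignment (or an unsupported
              // 96-bit access); break it into pieces of the known alignment.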
751               unsigned Align = Query.MMODescrs[0].AlignInBits;
752               return std::make_pair(0, LLT::scalar(Align));
753             })
754         .fewerElementsIf(
755             [=](const LegalityQuery &Query) -> bool {
756               return Query.Types[0].isVector() && needToSplitLoad(Query);
757             },
758             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
759               const LLT DstTy = Query.Types[0];
760               const LLT PtrTy = Query.Types[1];
761 
762               LLT EltTy = DstTy.getElementType();
763               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
764 
765               // Split if it's too large for the address space.
766               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
767                 unsigned NumElts = DstTy.getNumElements();
768                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
769 
770                 // FIXME: Refine when odd breakdowns handled
771                 // The scalars will need to be re-legalized.
772                 if (NumPieces == 1 || NumPieces >= NumElts ||
773                     NumElts % NumPieces != 0)
774                   return std::make_pair(0, EltTy);
775 
776                 return std::make_pair(0,
777                                       LLT::vector(NumElts / NumPieces, EltTy));
778               }
779 
780               // Need to split because of alignment.
781               unsigned Align = Query.MMODescrs[0].AlignInBits;
782               unsigned EltSize = EltTy.getSizeInBits();
783               if (EltSize > Align &&
784                   (EltSize / Align < DstTy.getNumElements())) {
785                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
786               }
787 
788               // May need relegalization for the scalars.
789               return std::make_pair(0, EltTy);
790             })
791         .minScalar(0, S32);
792 
793     if (IsStore)
794       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
795 
796     // TODO: Need a bitcast lower option?
797     Actions
798         .legalIf([=](const LegalityQuery &Query) {
799           const LLT Ty0 = Query.Types[0];
800           unsigned Size = Ty0.getSizeInBits();
801           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
802           unsigned Align = Query.MMODescrs[0].AlignInBits;
803 
804           // No extending vector loads.
805           if (Size > MemSize && Ty0.isVector())
806             return false;
807 
808           // FIXME: Widening store from alignment not valid.
809           if (MemSize < Size)
810             MemSize = std::max(MemSize, Align);
811 
812           switch (MemSize) {
813           case 8:
814           case 16:
815             return Size == 32;
816           case 32:
817           case 64:
818           case 128:
819             return true;
820           case 96:
821             return ST.hasDwordx3LoadStores();
822           case 256:
823           case 512:
824             return true;
825           default:
826             return false;
827           }
828         })
829         .widenScalarToNextPow2(0)
830         // TODO: v3s32->v4s32 with alignment
831         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
832   }
833 
834   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
835                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
836                                                   {S32, GlobalPtr, 16, 2 * 8},
837                                                   {S32, LocalPtr, 8, 8},
838                                                   {S32, LocalPtr, 16, 16},
839                                                   {S32, PrivatePtr, 8, 8},
840                                                   {S32, PrivatePtr, 16, 16},
841                                                   {S32, ConstantPtr, 8, 8},
842                                                   {S32, ConstantPtr, 16, 2 * 8}});
843   if (ST.hasFlatAddressSpace()) {
844     ExtLoads.legalForTypesWithMemDesc(
845         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
846   }
847 
848   ExtLoads.clampScalar(0, S32, S32)
849           .widenScalarToNextPow2(0)
850           .unsupportedIfMemSizeNotPow2()
851           .lower();
852 
853   auto &Atomics = getActionDefinitionsBuilder(
854     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
855      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
856      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
857      G_ATOMICRMW_UMIN})
858     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
859                {S64, GlobalPtr}, {S64, LocalPtr}});
860   if (ST.hasFlatAddressSpace()) {
861     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
862   }
863 
864   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
865     .legalFor({{S32, LocalPtr}});
866 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
869   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
870     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
871                 {S32, FlatPtr}, {S64, FlatPtr}})
872     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
873                {S32, RegionPtr}, {S64, RegionPtr}});
874 
875   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
876     .lower();
877 
878   // TODO: Pointer types, any 32-bit or 64-bit vector
879   getActionDefinitionsBuilder(G_SELECT)
880     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
881           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
882           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
883     .clampScalar(0, S16, S64)
884     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
885     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
886     .scalarize(1)
887     .clampMaxNumElements(0, S32, 2)
888     .clampMaxNumElements(0, LocalPtr, 2)
889     .clampMaxNumElements(0, PrivatePtr, 2)
890     .scalarize(0)
891     .widenScalarToNextPow2(0)
892     .legalIf(all(isPointer(0), typeIs(1, S1)));
893 
894   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
895   // be more flexible with the shift amount type.
896   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
897     .legalFor({{S32, S32}, {S64, S32}});
898   if (ST.has16BitInsts()) {
899     if (ST.hasVOP3PInsts()) {
900       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
901             .clampMaxNumElements(0, S16, 2);
902     } else
903       Shifts.legalFor({{S16, S32}, {S16, S16}});
904 
905     Shifts.clampScalar(1, S16, S32);
906     Shifts.clampScalar(0, S16, S64);
907     Shifts.widenScalarToNextPow2(0, 16);
908   } else {
909     // Make sure we legalize the shift amount type first, as the general
910     // expansion for the shifted type will produce much worse code if it hasn't
911     // been truncated already.
912     Shifts.clampScalar(1, S32, S32);
913     Shifts.clampScalar(0, S32, S64);
914     Shifts.widenScalarToNextPow2(0, 32);
915   }
916   Shifts.scalarize(0);
917 
918   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
919     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
920     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
921     unsigned IdxTypeIdx = 2;
922 
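    // Mark as custom the cases the target can handle: 16-bit or dword-multiple
    // elements, dword-multiple vectors up to 1024 bits, and a 32-bit index.
    // The custom lowering folds constant indices and leaves dynamic indexing
    // to selection.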
923     getActionDefinitionsBuilder(Op)
924       .customIf([=](const LegalityQuery &Query) {
925           const LLT EltTy = Query.Types[EltTypeIdx];
926           const LLT VecTy = Query.Types[VecTypeIdx];
927           const LLT IdxTy = Query.Types[IdxTypeIdx];
928           return (EltTy.getSizeInBits() == 16 ||
929                   EltTy.getSizeInBits() % 32 == 0) &&
930                  VecTy.getSizeInBits() % 32 == 0 &&
931                  VecTy.getSizeInBits() <= 1024 &&
932                  IdxTy.getSizeInBits() == 32;
933         })
934       .clampScalar(EltTypeIdx, S32, S64)
935       .clampScalar(VecTypeIdx, S32, S64)
936       .clampScalar(IdxTypeIdx, S32, S32);
937   }
938 
939   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
940     .unsupportedIf([=](const LegalityQuery &Query) {
941         const LLT &EltTy = Query.Types[1].getElementType();
942         return Query.Types[0] != EltTy;
943       });
944 
945   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
946     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
947     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
948 
949     // FIXME: Doesn't handle extract of illegal sizes.
950     getActionDefinitionsBuilder(Op)
951       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
952       // FIXME: Multiples of 16 should not be legal.
953       .legalIf([=](const LegalityQuery &Query) {
954           const LLT BigTy = Query.Types[BigTyIdx];
955           const LLT LitTy = Query.Types[LitTyIdx];
956           return (BigTy.getSizeInBits() % 32 == 0) &&
957                  (LitTy.getSizeInBits() % 16 == 0);
958         })
959       .widenScalarIf(
960         [=](const LegalityQuery &Query) {
961           const LLT BigTy = Query.Types[BigTyIdx];
962           return (BigTy.getScalarSizeInBits() < 16);
963         },
964         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
965       .widenScalarIf(
966         [=](const LegalityQuery &Query) {
967           const LLT LitTy = Query.Types[LitTyIdx];
968           return (LitTy.getScalarSizeInBits() < 16);
969         },
970         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
971       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
972       .widenScalarToNextPow2(BigTyIdx, 32);
973 
974   }
975 
976   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
977     .legalForCartesianProduct(AllS32Vectors, {S32})
978     .legalForCartesianProduct(AllS64Vectors, {S64})
979     .clampNumElements(0, V16S32, V32S32)
980     .clampNumElements(0, V2S64, V16S64)
981     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
982 
983   if (ST.hasScalarPackInsts())
984     BuildVector.legalFor({V2S16, S32});
985 
986   BuildVector
987     .minScalarSameAs(1, 0)
988     .legalIf(isRegisterType(0))
989     .minScalarOrElt(0, S32);
990 
991   if (ST.hasScalarPackInsts()) {
992     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
993       .legalFor({V2S16, S32})
994       .lower();
995   } else {
996     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
997       .lower();
998   }
999 
1000   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1001     .legalIf(isRegisterType(0));
1002 
1003   // TODO: Don't fully scalarize v2s16 pieces
1004   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1005 
1006   // Merge/Unmerge
1007   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1008     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1009     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1010 
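    // Vector pieces are only handled directly when the element size is a power
    // of two between 8 and 64 bits; anything else is scalarized below.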
1011     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1012       const LLT &Ty = Query.Types[TypeIdx];
1013       if (Ty.isVector()) {
1014         const LLT &EltTy = Ty.getElementType();
1015         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1016           return true;
1017         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1018           return true;
1019       }
1020       return false;
1021     };
1022 
1023     auto &Builder = getActionDefinitionsBuilder(Op)
1024       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
1026       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1027       // valid.
1028       .clampScalar(LitTyIdx, S16, S256)
1029       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1030       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1031       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1032                            elementTypeIs(1, S16)),
1033                        changeTo(1, V2S16))
1034       // Break up vectors with weird elements into scalars
1035       .fewerElementsIf(
1036         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1037         scalarize(0))
1038       .fewerElementsIf(
1039         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1040         scalarize(1))
1041       .clampScalar(BigTyIdx, S32, S1024)
1042       .lowerFor({{S16, V2S16}});
1043 
1044     if (Op == G_MERGE_VALUES) {
1045       Builder.widenScalarIf(
1046         // TODO: Use 16-bit shifts if legal for 8-bit values?
1047         [=](const LegalityQuery &Query) {
1048           const LLT Ty = Query.Types[LitTyIdx];
1049           return Ty.getSizeInBits() < 32;
1050         },
1051         changeTo(LitTyIdx, S32));
1052     }
1053 
1054     Builder.widenScalarIf(
1055       [=](const LegalityQuery &Query) {
1056         const LLT Ty = Query.Types[BigTyIdx];
1057         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1058           Ty.getSizeInBits() % 16 != 0;
1059       },
1060       [=](const LegalityQuery &Query) {
1061         // Pick the next power of 2, or a multiple of 64 over 128.
1062         // Whichever is smaller.
1063         const LLT &Ty = Query.Types[BigTyIdx];
1064         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1065         if (NewSizeInBits >= 256) {
1066           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1067           if (RoundedTo < NewSizeInBits)
1068             NewSizeInBits = RoundedTo;
1069         }
1070         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1071       })
1072       .legalIf([=](const LegalityQuery &Query) {
1073           const LLT &BigTy = Query.Types[BigTyIdx];
1074           const LLT &LitTy = Query.Types[LitTyIdx];
1075 
1076           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1077             return false;
1078           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1079             return false;
1080 
1081           return BigTy.getSizeInBits() % 16 == 0 &&
1082                  LitTy.getSizeInBits() % 16 == 0 &&
1083                  BigTy.getSizeInBits() <= 1024;
1084         })
1085       // Any vectors left are the wrong size. Scalarize them.
1086       .scalarize(0)
1087       .scalarize(1);
1088   }
1089 
1090   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1091 
1092   computeTables();
1093   verify(*ST.getInstrInfo());
1094 }
1095 
1096 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1097                                          MachineRegisterInfo &MRI,
1098                                          MachineIRBuilder &B,
1099                                          GISelChangeObserver &Observer) const {
1100   switch (MI.getOpcode()) {
1101   case TargetOpcode::G_ADDRSPACE_CAST:
1102     return legalizeAddrSpaceCast(MI, MRI, B);
1103   case TargetOpcode::G_FRINT:
1104     return legalizeFrint(MI, MRI, B);
1105   case TargetOpcode::G_FCEIL:
1106     return legalizeFceil(MI, MRI, B);
1107   case TargetOpcode::G_INTRINSIC_TRUNC:
1108     return legalizeIntrinsicTrunc(MI, MRI, B);
1109   case TargetOpcode::G_SITOFP:
1110     return legalizeITOFP(MI, MRI, B, true);
1111   case TargetOpcode::G_UITOFP:
1112     return legalizeITOFP(MI, MRI, B, false);
1113   case TargetOpcode::G_FMINNUM:
1114   case TargetOpcode::G_FMAXNUM:
1115   case TargetOpcode::G_FMINNUM_IEEE:
1116   case TargetOpcode::G_FMAXNUM_IEEE:
1117     return legalizeMinNumMaxNum(MI, MRI, B);
1118   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1119     return legalizeExtractVectorElt(MI, MRI, B);
1120   case TargetOpcode::G_INSERT_VECTOR_ELT:
1121     return legalizeInsertVectorElt(MI, MRI, B);
1122   case TargetOpcode::G_FSIN:
1123   case TargetOpcode::G_FCOS:
1124     return legalizeSinCos(MI, MRI, B);
1125   case TargetOpcode::G_GLOBAL_VALUE:
1126     return legalizeGlobalValue(MI, MRI, B);
1127   case TargetOpcode::G_LOAD:
1128     return legalizeLoad(MI, MRI, B, Observer);
1129   case TargetOpcode::G_FMAD:
1130     return legalizeFMad(MI, MRI, B);
1131   case TargetOpcode::G_FDIV:
1132     return legalizeFDIV(MI, MRI, B);
1133   case TargetOpcode::G_ATOMIC_CMPXCHG:
1134     return legalizeAtomicCmpXChg(MI, MRI, B);
1135   default:
1136     return false;
1137   }
1138 
1139   llvm_unreachable("expected switch to return");
1140 }
1141 
1142 Register AMDGPULegalizerInfo::getSegmentAperture(
1143   unsigned AS,
1144   MachineRegisterInfo &MRI,
1145   MachineIRBuilder &B) const {
1146   MachineFunction &MF = B.getMF();
1147   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1148   const LLT S32 = LLT::scalar(32);
1149 
1150   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1151 
1152   if (ST.hasApertureRegs()) {
1153     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1154     // getreg.
1155     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1156         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1157         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1158     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1159         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1160         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
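    // Pack the hwreg id, the bit offset of the aperture field, and its width
    // minus one into the s_getreg_b32 immediate.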
1161     unsigned Encoding =
1162         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1163         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1164         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1165 
1166     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1167     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1168 
1169     B.buildInstr(AMDGPU::S_GETREG_B32)
1170       .addDef(GetReg)
1171       .addImm(Encoding);
1172     MRI.setType(GetReg, S32);
1173 
1174     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1175     B.buildInstr(TargetOpcode::G_SHL)
1176       .addDef(ApertureReg)
1177       .addUse(GetReg)
1178       .addUse(ShiftAmt.getReg(0));
1179 
1180     return ApertureReg;
1181   }
1182 
1183   Register QueuePtr = MRI.createGenericVirtualRegister(
1184     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1185 
1186   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1187   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1188     return Register();
1189 
1190   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1191   // private_segment_aperture_base_hi.
1192   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1193 
1194   // TODO: can we be smarter about machine pointer info?
1195   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1196   MachineMemOperand *MMO = MF.getMachineMemOperand(
1197     PtrInfo,
1198     MachineMemOperand::MOLoad |
1199     MachineMemOperand::MODereferenceable |
1200     MachineMemOperand::MOInvariant,
1201     4,
1202     MinAlign(64, StructOffset));
1203 
1204   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1205   Register LoadAddr;
1206 
1207   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1208   B.buildLoad(LoadResult, LoadAddr, *MMO);
1209   return LoadResult;
1210 }
1211 
1212 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1213   MachineInstr &MI, MachineRegisterInfo &MRI,
1214   MachineIRBuilder &B) const {
1215   MachineFunction &MF = B.getMF();
1216 
1217   B.setInstr(MI);
1218 
1219   const LLT S32 = LLT::scalar(32);
1220   Register Dst = MI.getOperand(0).getReg();
1221   Register Src = MI.getOperand(1).getReg();
1222 
1223   LLT DstTy = MRI.getType(Dst);
1224   LLT SrcTy = MRI.getType(Src);
1225   unsigned DestAS = DstTy.getAddressSpace();
1226   unsigned SrcAS = SrcTy.getAddressSpace();
1227 
1228   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1229   // vector element.
1230   assert(!DstTy.isVector());
1231 
1232   const AMDGPUTargetMachine &TM
1233     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1234 
1235   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1236   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1237     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1238     return true;
1239   }
1240 
1241   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1242     // Truncate.
1243     B.buildExtract(Dst, Src, 0);
1244     MI.eraseFromParent();
1245     return true;
1246   }
1247 
1248   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1249     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1250     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1251 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer type. Merge operands are required to be the same type,
    // but creating an extra ptrtoint would be kind of pointless.
1255     auto HighAddr = B.buildConstant(
1256       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1257     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1258     MI.eraseFromParent();
1259     return true;
1260   }
1261 
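  // Flat to local/private: the segment offset is the low 32 bits of the flat
  // pointer, but a null flat pointer must map to the segment null value.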
1262   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1263     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1264            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1265     unsigned NullVal = TM.getNullPointerValue(DestAS);
1266 
1267     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1268     auto FlatNull = B.buildConstant(SrcTy, 0);
1269 
1270     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1271 
1272     // Extract low 32-bits of the pointer.
1273     B.buildExtract(PtrLo32, Src, 0);
1274 
1275     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1276     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1277     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1278 
1279     MI.eraseFromParent();
1280     return true;
1281   }
1282 
1283   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1284     return false;
1285 
1286   if (!ST.hasFlatAddressSpace())
1287     return false;
1288 
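  // Local/private to flat: merge the 32-bit segment offset with the aperture
  // base to form the 64-bit flat pointer, mapping the segment null value to
  // the flat null pointer.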
1289   auto SegmentNull =
1290       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1291   auto FlatNull =
1292       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1293 
1294   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1295   if (!ApertureReg.isValid())
1296     return false;
1297 
1298   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1299   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1300 
1301   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1302 
1303   // Coerce the type of the low half of the result so we can use merge_values.
1304   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1305   B.buildInstr(TargetOpcode::G_PTRTOINT)
1306     .addDef(SrcAsInt)
1307     .addUse(Src);
1308 
1309   // TODO: Should we allow mismatched types but matching sizes in merges to
1310   // avoid the ptrtoint?
1311   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1312   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1313 
1314   MI.eraseFromParent();
1315   return true;
1316 }
1317 
1318 bool AMDGPULegalizerInfo::legalizeFrint(
1319   MachineInstr &MI, MachineRegisterInfo &MRI,
1320   MachineIRBuilder &B) const {
1321   B.setInstr(MI);
1322 
1323   Register Src = MI.getOperand(1).getReg();
1324   LLT Ty = MRI.getType(Src);
1325   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1326 
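  // Round by adding and subtracting 2^52 with the sign of the source, which
  // rounds to integer in the current rounding mode. Magnitudes of 2^52 or more
  // are already integral and are passed through unchanged.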
1327   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1328   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1329 
1330   auto C1 = B.buildFConstant(Ty, C1Val);
1331   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1332 
1333   // TODO: Should this propagate fast-math-flags?
1334   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1335   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1336 
1337   auto C2 = B.buildFConstant(Ty, C2Val);
1338   auto Fabs = B.buildFAbs(Ty, Src);
1339 
1340   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1341   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1342   return true;
1343 }
1344 
1345 bool AMDGPULegalizerInfo::legalizeFceil(
1346   MachineInstr &MI, MachineRegisterInfo &MRI,
1347   MachineIRBuilder &B) const {
1348   B.setInstr(MI);
1349 
1350   const LLT S1 = LLT::scalar(1);
1351   const LLT S64 = LLT::scalar(64);
1352 
1353   Register Src = MI.getOperand(1).getReg();
1354   assert(MRI.getType(Src) == S64);
1355 
1356   // result = trunc(src)
1357   // if (src > 0.0 && src != result)
1358   //   result += 1.0
1359 
1360   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1361 
1362   const auto Zero = B.buildFConstant(S64, 0.0);
1363   const auto One = B.buildFConstant(S64, 1.0);
1364   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1365   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1366   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1367   auto Add = B.buildSelect(S64, And, One, Zero);
1368 
1369   // TODO: Should this propagate fast-math-flags?
1370   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1371   return true;
1372 }
1373 
1374 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1375                                               MachineIRBuilder &B) {
1376   const unsigned FractBits = 52;
1377   const unsigned ExpBits = 11;
1378   LLT S32 = LLT::scalar(32);
1379 
1380   auto Const0 = B.buildConstant(S32, FractBits - 32);
1381   auto Const1 = B.buildConstant(S32, ExpBits);
1382 
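  // Extract the 11-bit biased exponent from the high 32 bits of the f64 with
  // ubfe, then subtract the bias.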
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1386 
1387   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1388 }
1389 
1390 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1391   MachineInstr &MI, MachineRegisterInfo &MRI,
1392   MachineIRBuilder &B) const {
1393   B.setInstr(MI);
1394 
1395   const LLT S1 = LLT::scalar(1);
1396   const LLT S32 = LLT::scalar(32);
1397   const LLT S64 = LLT::scalar(64);
1398 
1399   Register Src = MI.getOperand(1).getReg();
1400   assert(MRI.getType(Src) == S64);
1401 
1402   // TODO: Should this use extract since the low half is unused?
1403   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1404   Register Hi = Unmerge.getReg(1);
1405 
1406   // Extract the upper half, since this is where we will find the sign and
1407   // exponent.
1408   auto Exp = extractF64Exponent(Hi, B);
1409 
1410   const unsigned FractBits = 52;
1411 
1412   // Extract the sign bit.
1413   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1414   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1415 
1416   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1417 
1418   const auto Zero32 = B.buildConstant(S32, 0);
1419 
1420   // Extend back to 64-bits.
1421   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1422 
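  // Shifting the fraction mask right by the exponent isolates the fractional
  // bits of the significand; clearing those bits in the source truncates the
  // value toward zero.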
1423   auto Shr = B.buildAShr(S64, FractMask, Exp);
1424   auto Not = B.buildNot(S64, Shr);
1425   auto Tmp0 = B.buildAnd(S64, Src, Not);
1426   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1427 
1428   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1429   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1430 
1431   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1432   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1433   return true;
1434 }
1435 
1436 bool AMDGPULegalizerInfo::legalizeITOFP(
1437   MachineInstr &MI, MachineRegisterInfo &MRI,
1438   MachineIRBuilder &B, bool Signed) const {
1439   B.setInstr(MI);
1440 
1441   Register Dst = MI.getOperand(0).getReg();
1442   Register Src = MI.getOperand(1).getReg();
1443 
1444   const LLT S64 = LLT::scalar(64);
1445   const LLT S32 = LLT::scalar(32);
1446 
1447   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1448 
1449   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1450 
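  // Convert each 32-bit half separately and recombine:
  //   fp(src) = fp(hi) * 2^32 + fp(lo)
  // where the low half is always treated as unsigned.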
1451   auto CvtHi = Signed ?
1452     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1453     B.buildUITOFP(S64, Unmerge.getReg(1));
1454 
1455   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1456 
1457   auto ThirtyTwo = B.buildConstant(S32, 32);
1458   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1459     .addUse(CvtHi.getReg(0))
1460     .addUse(ThirtyTwo.getReg(0));
1461 
1462   // TODO: Should this propagate fast-math-flags?
1463   B.buildFAdd(Dst, LdExp, CvtLo);
1464   MI.eraseFromParent();
1465   return true;
1466 }
1467 
1468 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1469   MachineInstr &MI, MachineRegisterInfo &MRI,
1470   MachineIRBuilder &B) const {
1471   MachineFunction &MF = B.getMF();
1472   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1473 
1474   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1475                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1476 
  // With ieee_mode disabled, G_FMINNUM/G_FMAXNUM already have the correct
  // behavior; the IEEE variants are not expected to appear in this mode.
1479   if (!MFI->getMode().IEEE)
1480     return !IsIEEEOp;
1481 
1482   if (IsIEEEOp)
1483     return true;
1484 
1485   MachineIRBuilder HelperBuilder(MI);
1486   GISelObserverWrapper DummyObserver;
1487   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1488   HelperBuilder.setInstr(MI);
1489   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1490 }
1491 
1492 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1493   MachineInstr &MI, MachineRegisterInfo &MRI,
1494   MachineIRBuilder &B) const {
1495   // TODO: Should move some of this into LegalizerHelper.
1496 
1497   // TODO: Promote dynamic indexing of s16 to s32
1498   // TODO: Dynamic s64 indexing is only legal for SGPR.
1499   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1500   if (!IdxVal) // Dynamic case will be selected to register indexing.
1501     return true;
1502 
1503   Register Dst = MI.getOperand(0).getReg();
1504   Register Vec = MI.getOperand(1).getReg();
1505 
1506   LLT VecTy = MRI.getType(Vec);
1507   LLT EltTy = VecTy.getElementType();
1508   assert(EltTy == MRI.getType(Dst));
1509 
1510   B.setInstr(MI);
1511 
1512   if (IdxVal.getValue() < VecTy.getNumElements())
1513     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1514   else
1515     B.buildUndef(Dst);
1516 
1517   MI.eraseFromParent();
1518   return true;
1519 }
1520 
1521 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1522   MachineInstr &MI, MachineRegisterInfo &MRI,
1523   MachineIRBuilder &B) const {
1524   // TODO: Should move some of this into LegalizerHelper.
1525 
1526   // TODO: Promote dynamic indexing of s16 to s32
1527   // TODO: Dynamic s64 indexing is only legal for SGPR.
1528   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1529   if (!IdxVal) // Dynamic case will be selected to register indexing.
1530     return true;
1531 
1532   Register Dst = MI.getOperand(0).getReg();
1533   Register Vec = MI.getOperand(1).getReg();
1534   Register Ins = MI.getOperand(2).getReg();
1535 
1536   LLT VecTy = MRI.getType(Vec);
1537   LLT EltTy = VecTy.getElementType();
1538   assert(EltTy == MRI.getType(Ins));
1539 
1540   B.setInstr(MI);
1541 
1542   if (IdxVal.getValue() < VecTy.getNumElements())
1543     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1544   else
1545     B.buildUndef(Dst);
1546 
1547   MI.eraseFromParent();
1548   return true;
1549 }
1550 
1551 bool AMDGPULegalizerInfo::legalizeSinCos(
1552   MachineInstr &MI, MachineRegisterInfo &MRI,
1553   MachineIRBuilder &B) const {
1554   B.setInstr(MI);
1555 
1556   Register DstReg = MI.getOperand(0).getReg();
1557   Register SrcReg = MI.getOperand(1).getReg();
1558   LLT Ty = MRI.getType(DstReg);
1559   unsigned Flags = MI.getFlags();
1560 
1561   Register TrigVal;
1562   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1563   if (ST.hasTrigReducedRange()) {
1564     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1565     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1566       .addUse(MulVal.getReg(0))
1567       .setMIFlags(Flags).getReg(0);
1568   } else
1569     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1570 
1571   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1572     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1573   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1574     .addUse(TrigVal)
1575     .setMIFlags(Flags);
1576   MI.eraseFromParent();
1577   return true;
1578 }
1579 
1580 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1581   Register DstReg, LLT PtrTy,
1582   MachineIRBuilder &B, const GlobalValue *GV,
1583   unsigned Offset, unsigned GAFlags) const {
1584   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1585   // to the following code sequence:
1586   //
1587   // For constant address space:
1588   //   s_getpc_b64 s[0:1]
1589   //   s_add_u32 s0, s0, $symbol
1590   //   s_addc_u32 s1, s1, 0
1591   //
1592   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1593   //   a fixup or relocation is emitted to replace $symbol with a literal
1594   //   constant, which is a pc-relative offset from the encoding of the $symbol
1595   //   operand to the global variable.
1596   //
1597   // For global address space:
1598   //   s_getpc_b64 s[0:1]
1599   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1600   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1601   //
1602   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1603   //   fixups or relocations are emitted to replace $symbol@*@lo and
1604   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1605   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1606   //   operand to the global variable.
1607   //
1608   // What we want here is an offset from the value returned by s_getpc
1609   // (which is the address of the s_add_u32 instruction) to the global
1610   // variable, but since the encoding of $symbol starts 4 bytes after the start
1611   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1612   // small. This requires us to add 4 to the global variable offset in order to
1613   // compute the correct address.
1614 
1615   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1616 
1617   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1618     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1619 
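  // The PC returned by s_getpc_b64 is always 64 bits wide, so for a 32-bit
  // result the pc-relative sum is formed in a 64-bit temporary register and
  // the low half is extracted below.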
1620   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1621     .addDef(PCReg);
1622 
1623   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1624   if (GAFlags == SIInstrInfo::MO_NONE)
1625     MIB.addImm(0);
1626   else
1627     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1628 
1629   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1630 
1631   if (PtrTy.getSizeInBits() == 32)
1632     B.buildExtract(DstReg, PCReg, 0);
1633   return true;
}
1635 
1636 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1637   MachineInstr &MI, MachineRegisterInfo &MRI,
1638   MachineIRBuilder &B) const {
1639   Register DstReg = MI.getOperand(0).getReg();
1640   LLT Ty = MRI.getType(DstReg);
1641   unsigned AS = Ty.getAddressSpace();
1642 
1643   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1644   MachineFunction &MF = B.getMF();
1645   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1646   B.setInstr(MI);
1647 
1648   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1649     if (!MFI->isEntryFunction()) {
1650       const Function &Fn = MF.getFunction();
1651       DiagnosticInfoUnsupported BadLDSDecl(
1652         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1653       Fn.getContext().diagnose(BadLDSDecl);
1654     }
1655 
1656     // TODO: We could emit code to handle the initialization somewhere.
1657     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1658       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1659       MI.eraseFromParent();
1660       return true;
1661     }
1662 
1663     const Function &Fn = MF.getFunction();
1664     DiagnosticInfoUnsupported BadInit(
1665       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1666     Fn.getContext().diagnose(BadInit);
1667     return true;
1668   }
1669 
1670   const SITargetLowering *TLI = ST.getTargetLowering();
1671 
1672   if (TLI->shouldEmitFixup(GV)) {
1673     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1674     MI.eraseFromParent();
1675     return true;
1676   }
1677 
1678   if (TLI->shouldEmitPCReloc(GV)) {
1679     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1680     MI.eraseFromParent();
1681     return true;
1682   }
1683 
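  // Otherwise the 64-bit address of the global is loaded from a GOT entry,
  // and the GOT entry itself is addressed pc-relatively.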
1684   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1685   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1686 
1687   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1688     MachinePointerInfo::getGOT(MF),
1689     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1690     MachineMemOperand::MOInvariant,
1691     8 /*Size*/, 8 /*Align*/);
1692 
1693   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1694 
1695   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1697     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1698     B.buildExtract(DstReg, Load, 0);
1699   } else
1700     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1701 
1702   MI.eraseFromParent();
1703   return true;
1704 }
1705 
1706 bool AMDGPULegalizerInfo::legalizeLoad(
1707   MachineInstr &MI, MachineRegisterInfo &MRI,
1708   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1709   B.setInstr(MI);
1710   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1711   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1712   Observer.changingInstr(MI);
1713   MI.getOperand(1).setReg(Cast.getReg(0));
1714   Observer.changedInstr(MI);
1715   return true;
1716 }
1717 
1718 bool AMDGPULegalizerInfo::legalizeFMad(
1719   MachineInstr &MI, MachineRegisterInfo &MRI,
1720   MachineIRBuilder &B) const {
1721   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1722   assert(Ty.isScalar());
1723 
1724   MachineFunction &MF = B.getMF();
1725   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1726 
1727   // TODO: Always legal with future ftz flag.
1728   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1729     return true;
1730   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1731     return true;
1732 
1734   MachineIRBuilder HelperBuilder(MI);
1735   GISelObserverWrapper DummyObserver;
1736   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1737   HelperBuilder.setMBB(*MI.getParent());
1738   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1739 }
1740 
1741 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1742   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1743   Register DstReg = MI.getOperand(0).getReg();
1744   Register PtrReg = MI.getOperand(1).getReg();
1745   Register CmpVal = MI.getOperand(2).getReg();
1746   Register NewVal = MI.getOperand(3).getReg();
1747 
1748   assert(SITargetLowering::isFlatGlobalAddrSpace(
1749            MRI.getType(PtrReg).getAddressSpace()) &&
1750          "this should not have been custom lowered");
1751 
1752   LLT ValTy = MRI.getType(CmpVal);
1753   LLT VecTy = LLT::vector(2, ValTy);
1754 
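  // The G_AMDGPU_ATOMIC_CMPXCHG pseudo takes the new value and the compare
  // value packed together into a single two-element vector operand.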
1755   B.setInstr(MI);
1756   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1757 
1758   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1759     .addDef(DstReg)
1760     .addUse(PtrReg)
1761     .addUse(PackedVal)
1762     .setMemRefs(MI.memoperands());
1763 
1764   MI.eraseFromParent();
1765   return true;
1766 }
1767 
// Return the unique G_BRCOND user in the same block, or null if the usage is
// invalid.
1769 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1770                                        MachineRegisterInfo &MRI) {
1771   Register CondDef = MI.getOperand(0).getReg();
1772   if (!MRI.hasOneNonDBGUse(CondDef))
1773     return nullptr;
1774 
1775   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1776   return UseMI.getParent() == MI.getParent() &&
1777     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1778 }
1779 
1780 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1781                                                 Register Reg, LLT Ty) const {
1782   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1783   if (LiveIn)
1784     return LiveIn;
1785 
1786   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1787   MRI.addLiveIn(Reg, NewReg);
1788   return NewReg;
1789 }
1790 
1791 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1792                                          const ArgDescriptor *Arg) const {
1793   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1794     return false; // TODO: Handle these
1795 
1796   assert(Arg->getRegister().isPhysical());
1797 
1798   MachineRegisterInfo &MRI = *B.getMRI();
1799 
1800   LLT Ty = MRI.getType(DstReg);
1801   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1802 
1803   if (Arg->isMasked()) {
1804     // TODO: Should we try to emit this once in the entry block?
1805     const LLT S32 = LLT::scalar(32);
1806     const unsigned Mask = Arg->getMask();
1807     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1808 
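    // A masked argument is a bitfield packed into a shared input register;
    // shift the field down and mask it to its width before use.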
1809     Register AndMaskSrc = LiveIn;
1810 
1811     if (Shift != 0) {
1812       auto ShiftAmt = B.buildConstant(S32, Shift);
1813       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1814     }
1815 
1816     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1817   } else
1818     B.buildCopy(DstReg, LiveIn);
1819 
  // Insert the argument copy if it doesn't already exist.
1821   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1822   if (!MRI.getVRegDef(LiveIn)) {
1823     // FIXME: Should have scoped insert pt
1824     MachineBasicBlock &OrigInsBB = B.getMBB();
1825     auto OrigInsPt = B.getInsertPt();
1826 
1827     MachineBasicBlock &EntryMBB = B.getMF().front();
1828     EntryMBB.addLiveIn(Arg->getRegister());
1829     B.setInsertPt(EntryMBB, EntryMBB.begin());
1830     B.buildCopy(LiveIn, Arg->getRegister());
1831 
1832     B.setInsertPt(OrigInsBB, OrigInsPt);
1833   }
1834 
1835   return true;
1836 }
1837 
1838 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1839   MachineInstr &MI,
1840   MachineRegisterInfo &MRI,
1841   MachineIRBuilder &B,
1842   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1843   B.setInstr(MI);
1844 
1845   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1846 
1847   const ArgDescriptor *Arg;
1848   const TargetRegisterClass *RC;
1849   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1850   if (!Arg) {
1851     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1852     return false;
1853   }
1854 
1855   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1856     MI.eraseFromParent();
1857     return true;
1858   }
1859 
1860   return false;
1861 }
1862 
1863 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1864                                        MachineRegisterInfo &MRI,
1865                                        MachineIRBuilder &B) const {
1866   B.setInstr(MI);
1867   Register Dst = MI.getOperand(0).getReg();
1868   LLT DstTy = MRI.getType(Dst);
1869   LLT S16 = LLT::scalar(16);
1870   LLT S32 = LLT::scalar(32);
1871   LLT S64 = LLT::scalar(64);
1872 
1873   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1874     return true;
1875 
1876   if (DstTy == S16)
1877     return legalizeFDIV16(MI, MRI, B);
1878   if (DstTy == S32)
1879     return legalizeFDIV32(MI, MRI, B);
1880   if (DstTy == S64)
1881     return legalizeFDIV64(MI, MRI, B);
1882 
1883   return false;
1884 }
1885 
1886 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1887                                                  MachineRegisterInfo &MRI,
1888                                                  MachineIRBuilder &B) const {
1889   Register Res = MI.getOperand(0).getReg();
1890   Register LHS = MI.getOperand(1).getReg();
1891   Register RHS = MI.getOperand(2).getReg();
1892 
1893   uint16_t Flags = MI.getFlags();
1894 
1895   LLT ResTy = MRI.getType(Res);
1896   LLT S32 = LLT::scalar(32);
1897   LLT S64 = LLT::scalar(64);
1898 
1899   const MachineFunction &MF = B.getMF();
1900   bool Unsafe =
1901     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1902 
1903   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1904     return false;
1905 
1906   if (!Unsafe && ResTy == S32 &&
1907       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1908     return false;
1909 
1910   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1911     // 1 / x -> RCP(x)
1912     if (CLHS->isExactlyValue(1.0)) {
1913       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1914         .addUse(RHS)
1915         .setMIFlags(Flags);
1916 
1917       MI.eraseFromParent();
1918       return true;
1919     }
1920 
1921     // -1 / x -> RCP( FNEG(x) )
1922     if (CLHS->isExactlyValue(-1.0)) {
1923       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1924       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1925         .addUse(FNeg.getReg(0))
1926         .setMIFlags(Flags);
1927 
1928       MI.eraseFromParent();
1929       return true;
1930     }
1931   }
1932 
1933   // x / y -> x * (1.0 / y)
1934   if (Unsafe) {
1935     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1936       .addUse(RHS)
1937       .setMIFlags(Flags);
1938     B.buildFMul(Res, LHS, RCP, Flags);
1939 
1940     MI.eraseFromParent();
1941     return true;
1942   }
1943 
1944   return false;
1945 }
1946 
1947 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1948                                          MachineRegisterInfo &MRI,
1949                                          MachineIRBuilder &B) const {
1950   B.setInstr(MI);
1951   Register Res = MI.getOperand(0).getReg();
1952   Register LHS = MI.getOperand(1).getReg();
1953   Register RHS = MI.getOperand(2).getReg();
1954 
1955   uint16_t Flags = MI.getFlags();
1956 
1957   LLT S16 = LLT::scalar(16);
1958   LLT S32 = LLT::scalar(32);
1959 
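  // The operands are extended to f32 and divided there with rcp + mul, the
  // quotient is truncated back to f16, and div_fixup patches up the special
  // cases.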
1960   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1961   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1962 
1963   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1964     .addUse(RHSExt.getReg(0))
1965     .setMIFlags(Flags);
1966 
1967   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
1968   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
1969 
1970   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
1971     .addUse(RDst.getReg(0))
1972     .addUse(RHS)
1973     .addUse(LHS)
1974     .setMIFlags(Flags);
1975 
1976   MI.eraseFromParent();
1977   return true;
1978 }
1979 
// Enable or disable FP32 denormal handling: when 'Enable' is true, emit
// instructions that turn FP32 denorm mode on; otherwise turn it off.
1982 static void toggleSPDenormMode(bool Enable,
1983                                MachineIRBuilder &B,
1984                                const GCNSubtarget &ST,
1985                                AMDGPU::SIModeRegisterDefaults Mode) {
1986   // Set SP denorm mode to this value.
1987   unsigned SPDenormMode =
1988     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1989 
1990   if (ST.hasDenormModeInst()) {
1991     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
1992     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
1993                                    ? FP_DENORM_FLUSH_NONE
1994                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1995 
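    // The 4-bit DENORM_MODE operand packs the FP32 control in its low two bits
    // and the FP64/FP16 control in the next two bits.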
1996     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
1997     B.buildInstr(AMDGPU::S_DENORM_MODE)
1998       .addImm(NewDenormModeValue);
1999 
2000   } else {
2001     // Select FP32 bit field in mode register.
2002     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2003                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2004                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2005 
2006     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2007       .addImm(SPDenormMode)
2008       .addImm(SPDenormModeBitField);
2009   }
2010 }
2011 
2012 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2013                                          MachineRegisterInfo &MRI,
2014                                          MachineIRBuilder &B) const {
2015   B.setInstr(MI);
2016   Register Res = MI.getOperand(0).getReg();
2017   Register LHS = MI.getOperand(1).getReg();
2018   Register RHS = MI.getOperand(2).getReg();
2019   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2020   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2021 
2022   uint16_t Flags = MI.getFlags();
2023 
2024   LLT S32 = LLT::scalar(32);
2025   LLT S1 = LLT::scalar(1);
2026 
2027   auto One = B.buildFConstant(S32, 1.0f);
2028 
2029   auto DenominatorScaled =
2030     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2031       .addUse(RHS)
2032       .addUse(LHS)
2033       .addImm(1)
2034       .setMIFlags(Flags);
2035   auto NumeratorScaled =
2036     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2037       .addUse(LHS)
2038       .addUse(RHS)
2039       .addImm(0)
2040       .setMIFlags(Flags);
2041 
2042   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2043     .addUse(DenominatorScaled.getReg(0))
2044     .setMIFlags(Flags);
2045   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2046 
2047   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2048   // aren't modeled as reading it.
2049   if (!Mode.FP32Denormals)
2050     toggleSPDenormMode(true, B, ST, Mode);
2051 
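  // One Newton-Raphson step refines the reciprocal estimate and a second
  // refines the quotient of the scaled operands:
  //   Fma0 = 1 - den * rcp        (reciprocal error)
  //   Fma1 = rcp + rcp * Fma0     (refined reciprocal)
  //   Mul  = num * Fma1           (quotient estimate)
  //   Fma2 = num - den * Mul      (quotient residual)
  //   Fma3 = Mul + Fma1 * Fma2    (refined quotient)
  //   Fma4 = num - den * Fma3     (residual used by div_fmas for rounding)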
2052   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2053   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2054   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2055   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2056   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2057   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2058 
2059   if (!Mode.FP32Denormals)
2060     toggleSPDenormMode(false, B, ST, Mode);
2061 
2062   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2063     .addUse(Fma4.getReg(0))
2064     .addUse(Fma1.getReg(0))
2065     .addUse(Fma3.getReg(0))
2066     .addUse(NumeratorScaled.getReg(1))
2067     .setMIFlags(Flags);
2068 
2069   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2070     .addUse(Fmas.getReg(0))
2071     .addUse(RHS)
2072     .addUse(LHS)
2073     .setMIFlags(Flags);
2074 
2075   MI.eraseFromParent();
2076   return true;
2077 }
2078 
2079 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2080                                          MachineRegisterInfo &MRI,
2081                                          MachineIRBuilder &B) const {
2082   B.setInstr(MI);
2083   Register Res = MI.getOperand(0).getReg();
2084   Register LHS = MI.getOperand(1).getReg();
2085   Register RHS = MI.getOperand(2).getReg();
2086 
2087   uint16_t Flags = MI.getFlags();
2088 
2089   LLT S64 = LLT::scalar(64);
2090   LLT S1 = LLT::scalar(1);
2091 
2092   auto One = B.buildFConstant(S64, 1.0);
2093 
2094   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2095     .addUse(LHS)
2096     .addUse(RHS)
2097     .addImm(1)
2098     .setMIFlags(Flags);
2099 
2100   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2101 
2102   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2103     .addUse(DivScale0.getReg(0))
2104     .setMIFlags(Flags);
2105 
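  // Two Newton-Raphson steps refine the reciprocal of the scaled denominator
  // before the quotient is formed; Fma4 is the final residual that div_fmas
  // uses for the last rounding step.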
2106   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2107   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2108   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2109 
2110   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2111     .addUse(LHS)
2112     .addUse(RHS)
2113     .addImm(0)
2114     .setMIFlags(Flags);
2115 
2116   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2118   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2119 
2120   Register Scale;
2121   if (!ST.hasUsableDivScaleConditionOutput()) {
2122     // Workaround a hardware bug on SI where the condition output from div_scale
2123     // is not usable.
2124 
2125     Scale = MRI.createGenericVirtualRegister(S1);
2126 
2127     LLT S32 = LLT::scalar(32);
2128 
2129     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2130     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2131     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2132     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2133 
2134     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2135                               Scale1Unmerge.getReg(1));
2136     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2137                               Scale0Unmerge.getReg(1));
2138     B.buildXor(Scale, CmpNum, CmpDen);
2139   } else {
2140     Scale = DivScale1.getReg(1);
2141   }
2142 
2143   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2144     .addUse(Fma4.getReg(0))
2145     .addUse(Fma3.getReg(0))
2146     .addUse(Mul.getReg(0))
2147     .addUse(Scale)
2148     .setMIFlags(Flags);
2149 
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2152     .addUse(Fmas.getReg(0))
2153     .addUse(RHS)
2154     .addUse(LHS)
2155     .setMIFlags(Flags);
2156 
2157   MI.eraseFromParent();
2158   return true;
2159 }
2160 
2161 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2162                                                  MachineRegisterInfo &MRI,
2163                                                  MachineIRBuilder &B) const {
2164   B.setInstr(MI);
2165   Register Res = MI.getOperand(0).getReg();
2166   Register LHS = MI.getOperand(2).getReg();
2167   Register RHS = MI.getOperand(3).getReg();
2168   uint16_t Flags = MI.getFlags();
2169 
2170   LLT S32 = LLT::scalar(32);
2171   LLT S1 = LLT::scalar(1);
2172 
2173   auto Abs = B.buildFAbs(S32, RHS, Flags);
2174   const APFloat C0Val(1.0f);
2175 
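  // C0 is 2^96 and C1 is 2^-32: when |RHS| is very large, pre-scale it by
  // 2^-32 so the reciprocal stays in range; the same factor multiplies the
  // final product, so it cancels out of the result.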
2176   auto C0 = B.buildConstant(S32, 0x6f800000);
2177   auto C1 = B.buildConstant(S32, 0x2f800000);
2178   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2179 
2180   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2181   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2182 
2183   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2184 
2185   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2186     .addUse(Mul0.getReg(0))
2187     .setMIFlags(Flags);
2188 
2189   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2190 
2191   B.buildFMul(Res, Sel, Mul1, Flags);
2192 
2193   MI.eraseFromParent();
2194   return true;
2195 }
2196 
2197 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2198                                                  MachineRegisterInfo &MRI,
2199                                                  MachineIRBuilder &B) const {
2200   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2201   if (!MFI->isEntryFunction()) {
2202     return legalizePreloadedArgIntrin(MI, MRI, B,
2203                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2204   }
2205 
2206   B.setInstr(MI);
2207 
2208   uint64_t Offset =
2209     ST.getTargetLowering()->getImplicitParameterOffset(
2210       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2211   Register DstReg = MI.getOperand(0).getReg();
2212   LLT DstTy = MRI.getType(DstReg);
2213   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2214 
2215   const ArgDescriptor *Arg;
2216   const TargetRegisterClass *RC;
2217   std::tie(Arg, RC)
2218     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2219   if (!Arg)
2220     return false;
2221 
2222   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2223   if (!loadInputValue(KernargPtrReg, B, Arg))
2224     return false;
2225 
2226   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2227   MI.eraseFromParent();
2228   return true;
2229 }
2230 
2231 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2232                                               MachineRegisterInfo &MRI,
2233                                               MachineIRBuilder &B,
2234                                               unsigned AddrSpace) const {
2235   B.setInstr(MI);
2236   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
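  // A flat pointer lies in the queried segment exactly when its high 32 bits
  // match the segment's aperture.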
2237   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2238   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2239   MI.eraseFromParent();
2240   return true;
2241 }
2242 
2243 /// Handle register layout difference for f16 images for some subtargets.
2244 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2245                                              MachineRegisterInfo &MRI,
2246                                              Register Reg) const {
2247   if (!ST.hasUnpackedD16VMem())
2248     return Reg;
2249 
2250   const LLT S16 = LLT::scalar(16);
2251   const LLT S32 = LLT::scalar(32);
2252   LLT StoreVT = MRI.getType(Reg);
2253   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2254 
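  // With unpacked D16, each 16-bit element occupies the low half of its own
  // 32-bit register, so widen every element and rebuild the vector.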
2255   auto Unmerge = B.buildUnmerge(S16, Reg);
2256 
2257   SmallVector<Register, 4> WideRegs;
2258   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2259     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2260 
2261   int NumElts = StoreVT.getNumElements();
2262 
2263   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2264 }
2265 
2266 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2267                                                  MachineRegisterInfo &MRI,
2268                                                  MachineIRBuilder &B,
2269                                                  bool IsFormat) const {
2270   // TODO: Reject f16 format on targets where unsupported.
2271   Register VData = MI.getOperand(1).getReg();
2272   LLT Ty = MRI.getType(VData);
2273 
2274   B.setInstr(MI);
2275 
2276   const LLT S32 = LLT::scalar(32);
2277   const LLT S16 = LLT::scalar(16);
2278 
2279   // Fixup illegal register types for i8 stores.
2280   if (Ty == LLT::scalar(8) || Ty == S16) {
2281     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2282     MI.getOperand(1).setReg(AnyExt);
2283     return true;
2284   }
2285 
2286   if (Ty.isVector()) {
2287     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2288       if (IsFormat)
2289         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2290       return true;
2291     }
2292 
2293     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2294   }
2295 
2296   return Ty == S32;
2297 }
2298 
2299 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2300                                             MachineRegisterInfo &MRI,
2301                                             MachineIRBuilder &B) const {
2302   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2303   auto IntrID = MI.getIntrinsicID();
2304   switch (IntrID) {
2305   case Intrinsic::amdgcn_if:
2306   case Intrinsic::amdgcn_else: {
2307     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2308       const SIRegisterInfo *TRI
2309         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2310 
2311       B.setInstr(*BrCond);
2312       Register Def = MI.getOperand(1).getReg();
2313       Register Use = MI.getOperand(3).getReg();
2314 
2315       if (IntrID == Intrinsic::amdgcn_if) {
2316         B.buildInstr(AMDGPU::SI_IF)
2317           .addDef(Def)
2318           .addUse(Use)
2319           .addMBB(BrCond->getOperand(1).getMBB());
2320       } else {
2321         B.buildInstr(AMDGPU::SI_ELSE)
2322           .addDef(Def)
2323           .addUse(Use)
2324           .addMBB(BrCond->getOperand(1).getMBB())
2325           .addImm(0);
2326       }
2327 
2328       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2329       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2330       MI.eraseFromParent();
2331       BrCond->eraseFromParent();
2332       return true;
2333     }
2334 
2335     return false;
2336   }
2337   case Intrinsic::amdgcn_loop: {
2338     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2339       const SIRegisterInfo *TRI
2340         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2341 
2342       B.setInstr(*BrCond);
2343       Register Reg = MI.getOperand(2).getReg();
2344       B.buildInstr(AMDGPU::SI_LOOP)
2345         .addUse(Reg)
2346         .addMBB(BrCond->getOperand(1).getMBB());
2347       MI.eraseFromParent();
2348       BrCond->eraseFromParent();
2349       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2350       return true;
2351     }
2352 
2353     return false;
2354   }
2355   case Intrinsic::amdgcn_kernarg_segment_ptr:
2356     return legalizePreloadedArgIntrin(
2357       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2358   case Intrinsic::amdgcn_implicitarg_ptr:
2359     return legalizeImplicitArgPtr(MI, MRI, B);
2360   case Intrinsic::amdgcn_workitem_id_x:
2361     return legalizePreloadedArgIntrin(MI, MRI, B,
2362                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2363   case Intrinsic::amdgcn_workitem_id_y:
2364     return legalizePreloadedArgIntrin(MI, MRI, B,
2365                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2366   case Intrinsic::amdgcn_workitem_id_z:
2367     return legalizePreloadedArgIntrin(MI, MRI, B,
2368                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2369   case Intrinsic::amdgcn_workgroup_id_x:
2370     return legalizePreloadedArgIntrin(MI, MRI, B,
2371                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2372   case Intrinsic::amdgcn_workgroup_id_y:
2373     return legalizePreloadedArgIntrin(MI, MRI, B,
2374                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2375   case Intrinsic::amdgcn_workgroup_id_z:
2376     return legalizePreloadedArgIntrin(MI, MRI, B,
2377                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2378   case Intrinsic::amdgcn_dispatch_ptr:
2379     return legalizePreloadedArgIntrin(MI, MRI, B,
2380                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2381   case Intrinsic::amdgcn_queue_ptr:
2382     return legalizePreloadedArgIntrin(MI, MRI, B,
2383                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2384   case Intrinsic::amdgcn_implicit_buffer_ptr:
2385     return legalizePreloadedArgIntrin(
2386       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2387   case Intrinsic::amdgcn_dispatch_id:
2388     return legalizePreloadedArgIntrin(MI, MRI, B,
2389                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2390   case Intrinsic::amdgcn_fdiv_fast:
2391     return legalizeFDIVFastIntrin(MI, MRI, B);
2392   case Intrinsic::amdgcn_is_shared:
2393     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2394   case Intrinsic::amdgcn_is_private:
2395     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2396   case Intrinsic::amdgcn_wavefrontsize: {
2397     B.setInstr(MI);
2398     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2399     MI.eraseFromParent();
2400     return true;
2401   }
2402   case Intrinsic::amdgcn_raw_buffer_store:
2403     return legalizeRawBufferStore(MI, MRI, B, false);
2404   case Intrinsic::amdgcn_raw_buffer_store_format:
2405     return legalizeRawBufferStore(MI, MRI, B, true);
2406   default:
2407     return true;
2408   }
2409 
2410   return true;
2411 }
2412