1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
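// Matches types whose total size is at most MaxSize bits and whose scalar
// element size is a multiple of 32 bits.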
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 512) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
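// Matches vectors of sub-32-bit elements with an odd element count; these are
// padded to an even count with oneMoreElement below.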
51 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
52   return [=](const LegalityQuery &Query) {
53     const LLT Ty = Query.Types[TypeIdx];
54     return Ty.isVector() &&
55            Ty.getNumElements() % 2 != 0 &&
56            Ty.getElementType().getSizeInBits() < 32;
57   };
58 }
59 
60 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getElementType();
64     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
65   };
66 }
67 
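// Mutation that reduces the vector to pieces of at most 64 bits each.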
68 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
69   return [=](const LegalityQuery &Query) {
70     const LLT Ty = Query.Types[TypeIdx];
71     const LLT EltTy = Ty.getElementType();
72     unsigned Size = Ty.getSizeInBits();
73     unsigned Pieces = (Size + 63) / 64;
74     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
75     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
76   };
77 }
78 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
81 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84 
85     const LLT EltTy = Ty.getElementType();
86     const int Size = Ty.getSizeInBits();
87     const int EltSize = EltTy.getSizeInBits();
88     const int NextMul32 = (Size + 31) / 32;
89 
90     assert(EltSize < 32);
91 
92     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
93     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
94   };
95 }
96 
97 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
98   return [=](const LegalityQuery &Query) {
99     const LLT QueryTy = Query.Types[TypeIdx];
100     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
101   };
102 }
103 
104 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
105   return [=](const LegalityQuery &Query) {
106     const LLT QueryTy = Query.Types[TypeIdx];
107     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
108   };
109 }
110 
111 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
112   return [=](const LegalityQuery &Query) {
113     const LLT QueryTy = Query.Types[TypeIdx];
114     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
115   };
116 }
117 
// Scalars that are a multiple of 32 bits up to 512 bits, and vectors of 32,
// 64, 128 or 256-bit elements or even multiples of v2s16.
120 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
121   return [=](const LegalityQuery &Query) {
122     const LLT Ty = Query.Types[TypeIdx];
123     if (Ty.isVector()) {
124       const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
128     }
129 
130     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
131   };
132 }
133 
134 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
135   return [=](const LegalityQuery &Query) {
136     return Query.Types[TypeIdx].getElementType() == Type;
137   };
138 }
139 
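// Matches truncating stores of scalars wider than 32 bits; the stored value
// is narrowed to 32 bits below.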
140 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
141   return [=](const LegalityQuery &Query) {
142     const LLT Ty = Query.Types[TypeIdx];
143     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
144            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
145   };
146 }
147 
148 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
149                                          const GCNTargetMachine &TM)
150   :  ST(ST_) {
151   using namespace TargetOpcode;
152 
153   auto GetAddrSpacePtr = [&TM](unsigned AS) {
154     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
155   };
156 
157   const LLT S1 = LLT::scalar(1);
158   const LLT S8 = LLT::scalar(8);
159   const LLT S16 = LLT::scalar(16);
160   const LLT S32 = LLT::scalar(32);
161   const LLT S64 = LLT::scalar(64);
162   const LLT S96 = LLT::scalar(96);
163   const LLT S128 = LLT::scalar(128);
164   const LLT S256 = LLT::scalar(256);
165   const LLT S512 = LLT::scalar(512);
166 
167   const LLT V2S16 = LLT::vector(2, 16);
168   const LLT V4S16 = LLT::vector(4, 16);
169 
170   const LLT V2S32 = LLT::vector(2, 32);
171   const LLT V3S32 = LLT::vector(3, 32);
172   const LLT V4S32 = LLT::vector(4, 32);
173   const LLT V5S32 = LLT::vector(5, 32);
174   const LLT V6S32 = LLT::vector(6, 32);
175   const LLT V7S32 = LLT::vector(7, 32);
176   const LLT V8S32 = LLT::vector(8, 32);
177   const LLT V9S32 = LLT::vector(9, 32);
178   const LLT V10S32 = LLT::vector(10, 32);
179   const LLT V11S32 = LLT::vector(11, 32);
180   const LLT V12S32 = LLT::vector(12, 32);
181   const LLT V13S32 = LLT::vector(13, 32);
182   const LLT V14S32 = LLT::vector(14, 32);
183   const LLT V15S32 = LLT::vector(15, 32);
184   const LLT V16S32 = LLT::vector(16, 32);
185 
186   const LLT V2S64 = LLT::vector(2, 64);
187   const LLT V3S64 = LLT::vector(3, 64);
188   const LLT V4S64 = LLT::vector(4, 64);
189   const LLT V5S64 = LLT::vector(5, 64);
190   const LLT V6S64 = LLT::vector(6, 64);
191   const LLT V7S64 = LLT::vector(7, 64);
192   const LLT V8S64 = LLT::vector(8, 64);
193 
194   std::initializer_list<LLT> AllS32Vectors =
195     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
196      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
197   std::initializer_list<LLT> AllS64Vectors =
198     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
199 
200   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
201   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
202   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
203   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
204   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
205   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
206   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
207 
208   const LLT CodePtr = FlatPtr;
209 
210   const std::initializer_list<LLT> AddrSpaces64 = {
211     GlobalPtr, ConstantPtr, FlatPtr
212   };
213 
214   const std::initializer_list<LLT> AddrSpaces32 = {
215     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
216   };
217 
218   const std::initializer_list<LLT> FPTypesBase = {
219     S32, S64
220   };
221 
222   const std::initializer_list<LLT> FPTypes16 = {
223     S32, S64, S16
224   };
225 
226   const std::initializer_list<LLT> FPTypesPK16 = {
227     S32, S64, S16, V2S16
228   };
229 
230   setAction({G_BRCOND, S1}, Legal);
231 
232   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
233   // elements for v3s16
234   getActionDefinitionsBuilder(G_PHI)
235     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
236     .legalFor(AllS32Vectors)
237     .legalFor(AllS64Vectors)
238     .legalFor(AddrSpaces64)
239     .legalFor(AddrSpaces32)
240     .clampScalar(0, S32, S256)
241     .widenScalarToNextPow2(0, 32)
242     .clampMaxNumElements(0, S32, 16)
243     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
244     .legalIf(isPointer(0));
245 
246   if (ST.has16BitInsts()) {
247     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
248       .legalFor({S32, S16})
249       .clampScalar(0, S16, S32)
250       .scalarize(0);
251   } else {
252     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
253       .legalFor({S32})
254       .clampScalar(0, S32, S32)
255       .scalarize(0);
256   }
257 
258   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
259     .legalFor({S32})
260     .clampScalar(0, S32, S32)
261     .scalarize(0);
262 
263   // Report legal for any types we can handle anywhere. For the cases only legal
264   // on the SALU, RegBankSelect will be able to re-legalize.
265   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
266     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
267     .clampScalar(0, S32, S64)
268     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
269     .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
270     .widenScalarToNextPow2(0)
271     .scalarize(0);
272 
273   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
274                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
275     .legalFor({{S32, S1}})
276     .clampScalar(0, S32, S32);
277 
278   getActionDefinitionsBuilder(G_BITCAST)
279     .legalForCartesianProduct({S32, V2S16})
280     .legalForCartesianProduct({S64, V2S32, V4S16})
281     .legalForCartesianProduct({V2S64, V4S32})
282     // Don't worry about the size constraint.
283     .legalIf(all(isPointer(0), isPointer(1)))
284     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});
286 
287   getActionDefinitionsBuilder(G_FCONSTANT)
288     .legalFor({S32, S64, S16})
289     .clampScalar(0, S16, S64);
290 
291   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
292     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
293                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
294     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
295     .clampScalarOrElt(0, S32, S512)
296     .legalIf(isMultiple32(0))
297     .widenScalarToNextPow2(0, 32)
298     .clampMaxNumElements(0, S32, 16);
299 
300 
301   // FIXME: i1 operands to intrinsics should always be legal, but other i1
302   // values may not be legal.  We need to figure out how to distinguish
303   // between these two scenarios.
304   getActionDefinitionsBuilder(G_CONSTANT)
305     .legalFor({S1, S32, S64, S16, GlobalPtr,
306                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
307     .clampScalar(0, S32, S64)
308     .widenScalarToNextPow2(0)
309     .legalIf(isPointer(0));
310 
311   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
312   getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});
313 
314 
315   auto &FPOpActions = getActionDefinitionsBuilder(
316     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
317     .legalFor({S32, S64});
318   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
319     .customFor({S32, S64});
320 
321   if (ST.has16BitInsts()) {
322     if (ST.hasVOP3PInsts())
323       FPOpActions.legalFor({S16, V2S16});
324     else
325       FPOpActions.legalFor({S16});
326 
327     TrigActions.customFor({S16});
328   }
329 
330   auto &MinNumMaxNum = getActionDefinitionsBuilder({
331       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
332 
333   if (ST.hasVOP3PInsts()) {
334     MinNumMaxNum.customFor(FPTypesPK16)
335       .clampMaxNumElements(0, S16, 2)
336       .clampScalar(0, S16, S64)
337       .scalarize(0);
338   } else if (ST.has16BitInsts()) {
339     MinNumMaxNum.customFor(FPTypes16)
340       .clampScalar(0, S16, S64)
341       .scalarize(0);
342   } else {
343     MinNumMaxNum.customFor(FPTypesBase)
344       .clampScalar(0, S32, S64)
345       .scalarize(0);
346   }
347 
348   if (ST.hasVOP3PInsts())
349     FPOpActions.clampMaxNumElements(0, S16, 2);
350 
351   FPOpActions
352     .scalarize(0)
353     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
354 
355   TrigActions
356     .scalarize(0)
357     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
358 
359   getActionDefinitionsBuilder({G_FNEG, G_FABS})
360     .legalFor(FPTypesPK16)
361     .clampMaxNumElements(0, S16, 2)
362     .scalarize(0)
363     .clampScalar(0, S16, S64);
364 
365   // TODO: Implement
366   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
367 
368   if (ST.has16BitInsts()) {
369     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
370       .legalFor({S32, S64, S16})
371       .scalarize(0)
372       .clampScalar(0, S16, S64);
373   } else {
374     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
375       .legalFor({S32, S64})
376       .scalarize(0)
377       .clampScalar(0, S32, S64);
378   }
379 
380   getActionDefinitionsBuilder(G_FPTRUNC)
381     .legalFor({{S32, S64}, {S16, S32}})
382     .scalarize(0);
383 
384   getActionDefinitionsBuilder(G_FPEXT)
385     .legalFor({{S64, S32}, {S32, S16}})
386     .lowerFor({{S64, S16}}) // FIXME: Implement
387     .scalarize(0);
388 
389   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
390   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
391 
392   getActionDefinitionsBuilder(G_FSUB)
393       // Use actual fsub instruction
394       .legalFor({S32})
395       // Must use fadd + fneg
396       .lowerFor({S64, S16, V2S16})
397       .scalarize(0)
398       .clampScalar(0, S32, S64);
399 
400   // Whether this is legal depends on the floating point mode for the function.
401   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
402   if (ST.hasMadF16())
403     FMad.customFor({S32, S16});
404   else
405     FMad.customFor({S32});
406   FMad.scalarize(0)
407       .lower();
408 
409   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
410     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
411                {S32, S1}, {S64, S1}, {S16, S1},
412                {S96, S32},
413                // FIXME: Hack
414                {S64, LLT::scalar(33)},
415                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
416     .scalarize(0);
417 
418   // TODO: Legal for s1->s64, requires split for VALU.
419   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
420     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}})
421     .lowerFor({{S32, S64}})
422     .customFor({{S64, S64}})
423     .scalarize(0);
424 
425   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
426     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
427     .scalarize(0);
428 
429   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
430     .legalFor({S32, S64})
431     .scalarize(0);
432 
433   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
434     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
435       .legalFor({S32, S64})
436       .clampScalar(0, S32, S64)
437       .scalarize(0);
438   } else {
439     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
440       .legalFor({S32})
441       .customFor({S64})
442       .clampScalar(0, S32, S64)
443       .scalarize(0);
444   }
445 
446   getActionDefinitionsBuilder(G_GEP)
447     .legalForCartesianProduct(AddrSpaces64, {S64})
448     .legalForCartesianProduct(AddrSpaces32, {S32})
449     .scalarize(0);
450 
451   getActionDefinitionsBuilder(G_PTR_MASK)
452     .scalarize(0)
453     .alwaysLegal();
454 
455   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
456 
457   auto &CmpBuilder =
458     getActionDefinitionsBuilder(G_ICMP)
459     .legalForCartesianProduct(
460       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
461     .legalFor({{S1, S32}, {S1, S64}});
462   if (ST.has16BitInsts()) {
463     CmpBuilder.legalFor({{S1, S16}});
464   }
465 
466   CmpBuilder
467     .widenScalarToNextPow2(1)
468     .clampScalar(1, S32, S64)
469     .scalarize(0)
470     .legalIf(all(typeIs(0, S1), isPointer(1)));
471 
472   getActionDefinitionsBuilder(G_FCMP)
473     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
474     .widenScalarToNextPow2(1)
475     .clampScalar(1, S32, S64)
476     .scalarize(0);
477 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
479   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
480                                G_FLOG, G_FLOG2, G_FLOG10})
481     .legalFor({S32})
482     .scalarize(0);
483 
484   // The 64-bit versions produce 32-bit results, but only on the SALU.
485   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
486                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
487                                G_CTPOP})
488     .legalFor({{S32, S32}, {S32, S64}})
489     .clampScalar(0, S32, S32)
490     .clampScalar(1, S32, S64)
491     .scalarize(0)
492     .widenScalarToNextPow2(0, 32)
493     .widenScalarToNextPow2(1, 32);
494 
495   // TODO: Expand for > s32
496   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
497     .legalFor({S32})
498     .clampScalar(0, S32, S32)
499     .scalarize(0);
500 
501   if (ST.has16BitInsts()) {
502     if (ST.hasVOP3PInsts()) {
503       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
504         .legalFor({S32, S16, V2S16})
505         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
506         .clampMaxNumElements(0, S16, 2)
507         .clampScalar(0, S16, S32)
508         .widenScalarToNextPow2(0)
509         .scalarize(0);
510     } else {
511       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
512         .legalFor({S32, S16})
513         .widenScalarToNextPow2(0)
514         .clampScalar(0, S16, S32)
515         .scalarize(0);
516     }
517   } else {
518     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
519       .legalFor({S32})
520       .clampScalar(0, S32, S32)
521       .widenScalarToNextPow2(0)
522       .scalarize(0);
523   }
524 
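  // Size comparisons between two type indices, used below to widen or narrow
  // the integer operand of G_INTTOPTR/G_PTRTOINT to the pointer size.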
525   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
526     return [=](const LegalityQuery &Query) {
527       return Query.Types[TypeIdx0].getSizeInBits() <
528              Query.Types[TypeIdx1].getSizeInBits();
529     };
530   };
531 
532   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
533     return [=](const LegalityQuery &Query) {
534       return Query.Types[TypeIdx0].getSizeInBits() >
535              Query.Types[TypeIdx1].getSizeInBits();
536     };
537   };
538 
539   getActionDefinitionsBuilder(G_INTTOPTR)
540     // List the common cases
541     .legalForCartesianProduct(AddrSpaces64, {S64})
542     .legalForCartesianProduct(AddrSpaces32, {S32})
543     .scalarize(0)
544     // Accept any address space as long as the size matches
545     .legalIf(sameSize(0, 1))
546     .widenScalarIf(smallerThan(1, 0),
547       [](const LegalityQuery &Query) {
548         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
549       })
550     .narrowScalarIf(greaterThan(1, 0),
551       [](const LegalityQuery &Query) {
552         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
553       });
554 
555   getActionDefinitionsBuilder(G_PTRTOINT)
556     // List the common cases
557     .legalForCartesianProduct(AddrSpaces64, {S64})
558     .legalForCartesianProduct(AddrSpaces32, {S32})
559     .scalarize(0)
560     // Accept any address space as long as the size matches
561     .legalIf(sameSize(0, 1))
562     .widenScalarIf(smallerThan(0, 1),
563       [](const LegalityQuery &Query) {
564         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
565       })
566     .narrowScalarIf(
567       greaterThan(0, 1),
568       [](const LegalityQuery &Query) {
569         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
570       });
571 
572   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
573     .scalarize(0)
574     .custom();
575 
576   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
577   // handle some operations by just promoting the register during
578   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
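  // Largest memory access size, in bits, that a single load or store may use
  // in the given address space.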
579   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
580     switch (AS) {
581     // FIXME: Private element size.
582     case AMDGPUAS::PRIVATE_ADDRESS:
583       return 32;
584     // FIXME: Check subtarget
585     case AMDGPUAS::LOCAL_ADDRESS:
586       return ST.useDS128() ? 128 : 64;
587 
588     // Treat constant and global as identical. SMRD loads are sometimes usable
589     // for global loads (ideally constant address space should be eliminated)
590     // depending on the context. Legality cannot be context dependent, but
591     // RegBankSelect can split the load as necessary depending on the pointer
592     // register bank/uniformity and if the memory is invariant or not written in
593     // a kernel.
594     case AMDGPUAS::CONSTANT_ADDRESS:
595     case AMDGPUAS::GLOBAL_ADDRESS:
596       return 512;
597     default:
598       return 128;
599     }
600   };
601 
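  // Return true if this memory access has to be broken into multiple
  // operations: vector extloads, accesses wider than the address space
  // allows, 96-bit accesses without dwordx3 support, or accesses more
  // misaligned than the target tolerates.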
602   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
603     const LLT DstTy = Query.Types[0];
604 
605     // Split vector extloads.
606     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
607     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
608       return true;
609 
610     const LLT PtrTy = Query.Types[1];
611     unsigned AS = PtrTy.getAddressSpace();
612     if (MemSize > maxSizeForAddrSpace(AS))
613       return true;
614 
615     // Catch weird sized loads that don't evenly divide into the access sizes
616     // TODO: May be able to widen depending on alignment etc.
617     unsigned NumRegs = MemSize / 32;
618     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
619       return true;
620 
621     unsigned Align = Query.MMODescrs[0].AlignInBits;
622     if (Align < MemSize) {
623       const SITargetLowering *TLI = ST.getTargetLowering();
624       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
625     }
626 
627     return false;
628   };
629 
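  // Minimum alignments, in bits, for global-like accesses; zero (i.e. no
  // restriction) when the subtarget supports unaligned buffer access.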
630   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
631   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
632   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
633 
634   // TODO: Refine based on subtargets which support unaligned access or 128-bit
635   // LDS
636   // TODO: Unsupported flat for SI.
637 
638   for (unsigned Op : {G_LOAD, G_STORE}) {
639     const bool IsStore = Op == G_STORE;
640 
641     auto &Actions = getActionDefinitionsBuilder(Op);
642     // Whitelist the common cases.
643     // TODO: Pointer loads
644     // TODO: Wide constant loads
645     // TODO: Only CI+ has 3x loads
646     // TODO: Loads to s16 on gfx9
647     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
648                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
649                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
650                                       {S96, GlobalPtr, 96, GlobalAlign32},
651                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
652                                       {S128, GlobalPtr, 128, GlobalAlign32},
653                                       {S64, GlobalPtr, 64, GlobalAlign32},
654                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
655                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
656                                       {S32, GlobalPtr, 8, GlobalAlign8},
657                                       {S32, GlobalPtr, 16, GlobalAlign16},
658 
659                                       {S32, LocalPtr, 32, 32},
660                                       {S64, LocalPtr, 64, 32},
661                                       {V2S32, LocalPtr, 64, 32},
662                                       {S32, LocalPtr, 8, 8},
663                                       {S32, LocalPtr, 16, 16},
664                                       {V2S16, LocalPtr, 32, 32},
665 
666                                       {S32, PrivatePtr, 32, 32},
667                                       {S32, PrivatePtr, 8, 8},
668                                       {S32, PrivatePtr, 16, 16},
669                                       {V2S16, PrivatePtr, 32, 32},
670 
671                                       {S32, FlatPtr, 32, GlobalAlign32},
672                                       {S32, FlatPtr, 16, GlobalAlign16},
673                                       {S32, FlatPtr, 8, GlobalAlign8},
674                                       {V2S16, FlatPtr, 32, GlobalAlign32},
675 
676                                       {S32, ConstantPtr, 32, GlobalAlign32},
677                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
678                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
679                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
680                                       {S64, ConstantPtr, 64, GlobalAlign32},
681                                       {S128, ConstantPtr, 128, GlobalAlign32},
682                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
683     Actions
684         .customIf(typeIs(1, Constant32Ptr))
685         .narrowScalarIf(
686             [=](const LegalityQuery &Query) -> bool {
687               return !Query.Types[0].isVector() && needToSplitLoad(Query);
688             },
689             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
690               const LLT DstTy = Query.Types[0];
691               const LLT PtrTy = Query.Types[1];
692 
693               const unsigned DstSize = DstTy.getSizeInBits();
694               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
695 
696               // Split extloads.
697               if (DstSize > MemSize)
698                 return std::make_pair(0, LLT::scalar(MemSize));
699 
700               if (DstSize > 32 && (DstSize % 32 != 0)) {
701                 // FIXME: Need a way to specify non-extload of larger size if
702                 // suitably aligned.
703                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
704               }
705 
706               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
707               if (MemSize > MaxSize)
708                 return std::make_pair(0, LLT::scalar(MaxSize));
709 
710               unsigned Align = Query.MMODescrs[0].AlignInBits;
711               return std::make_pair(0, LLT::scalar(Align));
712             })
713         .fewerElementsIf(
714             [=](const LegalityQuery &Query) -> bool {
715               return Query.Types[0].isVector() && needToSplitLoad(Query);
716             },
717             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
718               const LLT DstTy = Query.Types[0];
719               const LLT PtrTy = Query.Types[1];
720 
721               LLT EltTy = DstTy.getElementType();
722               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
723 
724               // Split if it's too large for the address space.
725               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
726                 unsigned NumElts = DstTy.getNumElements();
727                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
728 
729                 // FIXME: Refine when odd breakdowns handled
730                 // The scalars will need to be re-legalized.
731                 if (NumPieces == 1 || NumPieces >= NumElts ||
732                     NumElts % NumPieces != 0)
733                   return std::make_pair(0, EltTy);
734 
735                 return std::make_pair(0,
736                                       LLT::vector(NumElts / NumPieces, EltTy));
737               }
738 
739               // Need to split because of alignment.
740               unsigned Align = Query.MMODescrs[0].AlignInBits;
741               unsigned EltSize = EltTy.getSizeInBits();
742               if (EltSize > Align &&
743                   (EltSize / Align < DstTy.getNumElements())) {
744                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
745               }
746 
747               // May need relegalization for the scalars.
748               return std::make_pair(0, EltTy);
749             })
750         .minScalar(0, S32);
751 
752     if (IsStore)
753       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
754 
755     // TODO: Need a bitcast lower option?
756     Actions
757         .legalIf([=](const LegalityQuery &Query) {
758           const LLT Ty0 = Query.Types[0];
759           unsigned Size = Ty0.getSizeInBits();
760           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
761           unsigned Align = Query.MMODescrs[0].AlignInBits;
762 
763           // No extending vector loads.
764           if (Size > MemSize && Ty0.isVector())
765             return false;
766 
767           // FIXME: Widening store from alignment not valid.
768           if (MemSize < Size)
769             MemSize = std::max(MemSize, Align);
770 
771           switch (MemSize) {
772           case 8:
773           case 16:
774             return Size == 32;
775           case 32:
776           case 64:
777           case 128:
778             return true;
779           case 96:
780             return ST.hasDwordx3LoadStores();
781           case 256:
782           case 512:
783             return true;
784           default:
785             return false;
786           }
787         })
788         .widenScalarToNextPow2(0)
789         // TODO: v3s32->v4s32 with alignment
790         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
791   }
792 
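  // Extending loads from 8-bit and 16-bit memory accesses.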
793   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
794                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
795                                                   {S32, GlobalPtr, 16, 2 * 8},
796                                                   {S32, LocalPtr, 8, 8},
797                                                   {S32, LocalPtr, 16, 16},
798                                                   {S32, PrivatePtr, 8, 8},
799                                                   {S32, PrivatePtr, 16, 16},
800                                                   {S32, ConstantPtr, 8, 8},
801                                                   {S32, ConstantPtr, 16, 2 * 8}});
802   if (ST.hasFlatAddressSpace()) {
803     ExtLoads.legalForTypesWithMemDesc(
804         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
805   }
806 
807   ExtLoads.clampScalar(0, S32, S32)
808           .widenScalarToNextPow2(0)
809           .unsupportedIfMemSizeNotPow2()
810           .lower();
811 
812   auto &Atomics = getActionDefinitionsBuilder(
813     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
814      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
815      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
816      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
817     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
818                {S64, GlobalPtr}, {S64, LocalPtr}});
819   if (ST.hasFlatAddressSpace()) {
820     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
821   }
822 
823   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
824     .legalFor({{S32, LocalPtr}});
825 
826   // TODO: Pointer types, any 32-bit or 64-bit vector
827   getActionDefinitionsBuilder(G_SELECT)
828     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
829           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
830           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
831     .clampScalar(0, S16, S64)
832     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
833     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
834     .scalarize(1)
835     .clampMaxNumElements(0, S32, 2)
836     .clampMaxNumElements(0, LocalPtr, 2)
837     .clampMaxNumElements(0, PrivatePtr, 2)
838     .scalarize(0)
839     .widenScalarToNextPow2(0)
840     .legalIf(all(isPointer(0), typeIs(1, S1)));
841 
842   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
843   // be more flexible with the shift amount type.
844   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
845     .legalFor({{S32, S32}, {S64, S32}});
846   if (ST.has16BitInsts()) {
847     if (ST.hasVOP3PInsts()) {
848       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
849             .clampMaxNumElements(0, S16, 2);
850     } else
851       Shifts.legalFor({{S16, S32}, {S16, S16}});
852 
853     Shifts.clampScalar(1, S16, S32);
854     Shifts.clampScalar(0, S16, S64);
855     Shifts.widenScalarToNextPow2(0, 16);
856   } else {
857     // Make sure we legalize the shift amount type first, as the general
858     // expansion for the shifted type will produce much worse code if it hasn't
859     // been truncated already.
860     Shifts.clampScalar(1, S32, S32);
861     Shifts.clampScalar(0, S32, S64);
862     Shifts.widenScalarToNextPow2(0, 32);
863   }
864   Shifts.scalarize(0);
865 
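  // Extract/insert of vector elements is custom lowered: constant indices are
  // folded to G_EXTRACT/G_INSERT, and dynamic indices are left for selection
  // to handle with register indexing (see legalizeExtractVectorElt).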
866   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
867     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
868     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
869     unsigned IdxTypeIdx = 2;
870 
871     getActionDefinitionsBuilder(Op)
872       .customIf([=](const LegalityQuery &Query) {
873           const LLT EltTy = Query.Types[EltTypeIdx];
874           const LLT VecTy = Query.Types[VecTypeIdx];
875           const LLT IdxTy = Query.Types[IdxTypeIdx];
876           return (EltTy.getSizeInBits() == 16 ||
877                   EltTy.getSizeInBits() % 32 == 0) &&
878                  VecTy.getSizeInBits() % 32 == 0 &&
879                  VecTy.getSizeInBits() <= 512 &&
880                  IdxTy.getSizeInBits() == 32;
881         })
882       .clampScalar(EltTypeIdx, S32, S64)
883       .clampScalar(VecTypeIdx, S32, S64)
884       .clampScalar(IdxTypeIdx, S32, S32);
885   }
886 
887   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
888     .unsupportedIf([=](const LegalityQuery &Query) {
889         const LLT &EltTy = Query.Types[1].getElementType();
890         return Query.Types[0] != EltTy;
891       });
892 
893   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
894     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
895     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
896 
897     // FIXME: Doesn't handle extract of illegal sizes.
898     getActionDefinitionsBuilder(Op)
899       .legalIf([=](const LegalityQuery &Query) {
900           const LLT BigTy = Query.Types[BigTyIdx];
901           const LLT LitTy = Query.Types[LitTyIdx];
902           return (BigTy.getSizeInBits() % 32 == 0) &&
903                  (LitTy.getSizeInBits() % 16 == 0);
904         })
905       .widenScalarIf(
906         [=](const LegalityQuery &Query) {
907           const LLT BigTy = Query.Types[BigTyIdx];
908           return (BigTy.getScalarSizeInBits() < 16);
909         },
910         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
911       .widenScalarIf(
912         [=](const LegalityQuery &Query) {
913           const LLT LitTy = Query.Types[LitTyIdx];
914           return (LitTy.getScalarSizeInBits() < 16);
915         },
916         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
917       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
918       .widenScalarToNextPow2(BigTyIdx, 32);
919 
920   }
921 
922   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
923     .legalForCartesianProduct(AllS32Vectors, {S32})
924     .legalForCartesianProduct(AllS64Vectors, {S64})
925     .clampNumElements(0, V16S32, V16S32)
926     .clampNumElements(0, V2S64, V8S64);
927 
928   if (ST.hasScalarPackInsts())
929     BuildVector.legalFor({V2S16, S32});
930 
931   BuildVector
932     .minScalarSameAs(1, 0)
933     .legalIf(isRegisterType(0))
934     .minScalarOrElt(0, S32);
935 
936   if (ST.hasScalarPackInsts()) {
937     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
938       .legalFor({V2S16, S32})
939       .lower();
940   } else {
941     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
942       .lower();
943   }
944 
945   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
946     .legalIf(isRegisterType(0));
947 
948   // TODO: Don't fully scalarize v2s16 pieces
949   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
950 
951   // Merge/Unmerge
952   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
953     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
954     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
955 
956     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
957       const LLT &Ty = Query.Types[TypeIdx];
958       if (Ty.isVector()) {
959         const LLT &EltTy = Ty.getElementType();
960         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
961           return true;
962         if (!isPowerOf2_32(EltTy.getSizeInBits()))
963           return true;
964       }
965       return false;
966     };
967 
968     getActionDefinitionsBuilder(Op)
969       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
973       .clampScalar(LitTyIdx, S16, S256)
974       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
975       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
976       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
977                            elementTypeIs(1, S16)),
978                        changeTo(1, V2S16))
979       // Break up vectors with weird elements into scalars
980       .fewerElementsIf(
981         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
982         scalarize(0))
983       .fewerElementsIf(
984         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
985         scalarize(1))
986       .clampScalar(BigTyIdx, S32, S512)
987       .lowerFor({{S16, V2S16}})
988       .widenScalarIf(
989         [=](const LegalityQuery &Query) {
990           const LLT &Ty = Query.Types[BigTyIdx];
991           return !isPowerOf2_32(Ty.getSizeInBits()) &&
992                  Ty.getSizeInBits() % 16 != 0;
993         },
994         [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128, whichever
          // is smaller.
997           const LLT &Ty = Query.Types[BigTyIdx];
998           unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
999           if (NewSizeInBits >= 256) {
1000             unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1001             if (RoundedTo < NewSizeInBits)
1002               NewSizeInBits = RoundedTo;
1003           }
1004           return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1005         })
1006       .legalIf([=](const LegalityQuery &Query) {
1007           const LLT &BigTy = Query.Types[BigTyIdx];
1008           const LLT &LitTy = Query.Types[LitTyIdx];
1009 
1010           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1011             return false;
1012           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1013             return false;
1014 
1015           return BigTy.getSizeInBits() % 16 == 0 &&
1016                  LitTy.getSizeInBits() % 16 == 0 &&
1017                  BigTy.getSizeInBits() <= 512;
1018         })
1019       // Any vectors left are the wrong size. Scalarize them.
1020       .scalarize(0)
1021       .scalarize(1);
1022   }
1023 
1024   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1025 
1026   computeTables();
1027   verify(*ST.getInstrInfo());
1028 }
1029 
1030 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1031                                          MachineRegisterInfo &MRI,
1032                                          MachineIRBuilder &B,
1033                                          GISelChangeObserver &Observer) const {
1034   switch (MI.getOpcode()) {
1035   case TargetOpcode::G_ADDRSPACE_CAST:
1036     return legalizeAddrSpaceCast(MI, MRI, B);
1037   case TargetOpcode::G_FRINT:
1038     return legalizeFrint(MI, MRI, B);
1039   case TargetOpcode::G_FCEIL:
1040     return legalizeFceil(MI, MRI, B);
1041   case TargetOpcode::G_INTRINSIC_TRUNC:
1042     return legalizeIntrinsicTrunc(MI, MRI, B);
1043   case TargetOpcode::G_SITOFP:
1044     return legalizeITOFP(MI, MRI, B, true);
1045   case TargetOpcode::G_UITOFP:
1046     return legalizeITOFP(MI, MRI, B, false);
1047   case TargetOpcode::G_FMINNUM:
1048   case TargetOpcode::G_FMAXNUM:
1049   case TargetOpcode::G_FMINNUM_IEEE:
1050   case TargetOpcode::G_FMAXNUM_IEEE:
1051     return legalizeMinNumMaxNum(MI, MRI, B);
1052   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1053     return legalizeExtractVectorElt(MI, MRI, B);
1054   case TargetOpcode::G_INSERT_VECTOR_ELT:
1055     return legalizeInsertVectorElt(MI, MRI, B);
1056   case TargetOpcode::G_FSIN:
1057   case TargetOpcode::G_FCOS:
1058     return legalizeSinCos(MI, MRI, B);
1059   case TargetOpcode::G_GLOBAL_VALUE:
1060     return legalizeGlobalValue(MI, MRI, B);
1061   case TargetOpcode::G_LOAD:
1062     return legalizeLoad(MI, MRI, B, Observer);
1063   case TargetOpcode::G_FMAD:
1064     return legalizeFMad(MI, MRI, B);
1065   default:
1066     return false;
1067   }
1068 
1069   llvm_unreachable("expected switch to return");
1070 }
1071 
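// Return a 32-bit register holding the aperture for the given segment address
// space, i.e. the high 32 bits of flat addresses that map into that segment,
// read either from the aperture hardware registers or from the queue pointer.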
1072 Register AMDGPULegalizerInfo::getSegmentAperture(
1073   unsigned AS,
1074   MachineRegisterInfo &MRI,
1075   MachineIRBuilder &B) const {
1076   MachineFunction &MF = B.getMF();
1077   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1078   const LLT S32 = LLT::scalar(32);
1079 
1080   if (ST.hasApertureRegs()) {
1081     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1082     // getreg.
1083     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1084         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1085         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1086     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1087         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1088         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1089     unsigned Encoding =
1090         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1091         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1092         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1093 
1094     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1095     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1096 
1097     B.buildInstr(AMDGPU::S_GETREG_B32)
1098       .addDef(GetReg)
1099       .addImm(Encoding);
1100     MRI.setType(GetReg, S32);
1101 
1102     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1103     B.buildInstr(TargetOpcode::G_SHL)
1104       .addDef(ApertureReg)
1105       .addUse(GetReg)
1106       .addUse(ShiftAmt.getReg(0));
1107 
1108     return ApertureReg;
1109   }
1110 
1111   Register QueuePtr = MRI.createGenericVirtualRegister(
1112     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1113 
1114   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1115   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1116     return Register();
1117 
1118   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1119   // private_segment_aperture_base_hi.
1120   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1121 
1122   // FIXME: Don't use undef
1123   Value *V = UndefValue::get(PointerType::get(
1124                                Type::getInt8Ty(MF.getFunction().getContext()),
1125                                AMDGPUAS::CONSTANT_ADDRESS));
1126 
1127   MachinePointerInfo PtrInfo(V, StructOffset);
1128   MachineMemOperand *MMO = MF.getMachineMemOperand(
1129     PtrInfo,
1130     MachineMemOperand::MOLoad |
1131     MachineMemOperand::MODereferenceable |
1132     MachineMemOperand::MOInvariant,
1133     4,
1134     MinAlign(64, StructOffset));
1135 
1136   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1137   Register LoadAddr;
1138 
1139   B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1140   B.buildLoad(LoadResult, LoadAddr, *MMO);
1141   return LoadResult;
1142 }
1143 
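// Custom lowering for G_ADDRSPACE_CAST: no-op casts become bitcasts, casts
// involving the 32-bit constant address space extract or merge the low half,
// flat -> local/private casts keep the low 32 bits, and local/private -> flat
// casts merge in the segment aperture; both of the latter select the
// destination null value when the source is null.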
1144 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1145   MachineInstr &MI, MachineRegisterInfo &MRI,
1146   MachineIRBuilder &B) const {
1147   MachineFunction &MF = B.getMF();
1148 
1149   B.setInstr(MI);
1150 
1151   const LLT S32 = LLT::scalar(32);
1152   Register Dst = MI.getOperand(0).getReg();
1153   Register Src = MI.getOperand(1).getReg();
1154 
1155   LLT DstTy = MRI.getType(Dst);
1156   LLT SrcTy = MRI.getType(Src);
1157   unsigned DestAS = DstTy.getAddressSpace();
1158   unsigned SrcAS = SrcTy.getAddressSpace();
1159 
1160   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1161   // vector element.
1162   assert(!DstTy.isVector());
1163 
1164   const AMDGPUTargetMachine &TM
1165     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1166 
1167   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1168   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1169     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1170     return true;
1171   }
1172 
1173   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1174     // Truncate.
1175     B.buildExtract(Dst, Src, 0);
1176     MI.eraseFromParent();
1177     return true;
1178   }
1179 
1180   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1181     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1182     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1183 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1187     auto HighAddr = B.buildConstant(
1188       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1189     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1190     MI.eraseFromParent();
1191     return true;
1192   }
1193 
1194   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1195     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1196            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1197     unsigned NullVal = TM.getNullPointerValue(DestAS);
1198 
1199     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1200     auto FlatNull = B.buildConstant(SrcTy, 0);
1201 
1202     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1203 
1204     // Extract low 32-bits of the pointer.
1205     B.buildExtract(PtrLo32, Src, 0);
1206 
1207     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1208     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1209     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1210 
1211     MI.eraseFromParent();
1212     return true;
1213   }
1214 
1215   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1216     return false;
1217 
1218   if (!ST.hasFlatAddressSpace())
1219     return false;
1220 
1221   auto SegmentNull =
1222       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1223   auto FlatNull =
1224       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1225 
1226   Register ApertureReg = getSegmentAperture(DestAS, MRI, B);
1227   if (!ApertureReg.isValid())
1228     return false;
1229 
1230   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1231   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1232 
1233   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1234 
1235   // Coerce the type of the low half of the result so we can use merge_values.
1236   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1237   B.buildInstr(TargetOpcode::G_PTRTOINT)
1238     .addDef(SrcAsInt)
1239     .addUse(Src);
1240 
1241   // TODO: Should we allow mismatched types but matching sizes in merges to
1242   // avoid the ptrtoint?
1243   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1244   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1245 
1246   MI.eraseFromParent();
1247   return true;
1248 }
1249 
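// Lower f64 G_FRINT by adding and subtracting 2^52 with the sign of the
// source, which forces the FPU to round to an integer; sources with a large
// enough magnitude are already integral and are passed through unchanged.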
1250 bool AMDGPULegalizerInfo::legalizeFrint(
1251   MachineInstr &MI, MachineRegisterInfo &MRI,
1252   MachineIRBuilder &B) const {
1253   B.setInstr(MI);
1254 
1255   Register Src = MI.getOperand(1).getReg();
1256   LLT Ty = MRI.getType(Src);
1257   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1258 
1259   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1260   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1261 
1262   auto C1 = B.buildFConstant(Ty, C1Val);
1263   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1264 
1265   // TODO: Should this propagate fast-math-flags?
1266   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1267   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1268 
1269   auto C2 = B.buildFConstant(Ty, C2Val);
1270   auto Fabs = B.buildFAbs(Ty, Src);
1271 
1272   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1273   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1274   return true;
1275 }
1276 
1277 bool AMDGPULegalizerInfo::legalizeFceil(
1278   MachineInstr &MI, MachineRegisterInfo &MRI,
1279   MachineIRBuilder &B) const {
1280   B.setInstr(MI);
1281 
1282   const LLT S1 = LLT::scalar(1);
1283   const LLT S64 = LLT::scalar(64);
1284 
1285   Register Src = MI.getOperand(1).getReg();
1286   assert(MRI.getType(Src) == S64);
1287 
1288   // result = trunc(src)
1289   // if (src > 0.0 && src != result)
1290   //   result += 1.0
1291 
1292   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1293 
1294   const auto Zero = B.buildFConstant(S64, 0.0);
1295   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1299   auto Add = B.buildSelect(S64, And, One, Zero);
1300 
1301   // TODO: Should this propagate fast-math-flags?
1302   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1303   return true;
1304 }
1305 
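// Extract the 11-bit exponent field from the high 32 bits of an f64 value and
// remove the bias (1023).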
1306 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1307                                               MachineIRBuilder &B) {
1308   const unsigned FractBits = 52;
1309   const unsigned ExpBits = 11;
1310   LLT S32 = LLT::scalar(32);
1311 
1312   auto Const0 = B.buildConstant(S32, FractBits - 32);
1313   auto Const1 = B.buildConstant(S32, ExpBits);
1314 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1318 
1319   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1320 }
1321 
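// Lower f64 G_INTRINSIC_TRUNC by masking off the fraction bits selected by
// the exponent: exponents below zero produce a signed zero, and exponents
// above 51 mean the value is already integral.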
1322 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1323   MachineInstr &MI, MachineRegisterInfo &MRI,
1324   MachineIRBuilder &B) const {
1325   B.setInstr(MI);
1326 
1327   const LLT S1 = LLT::scalar(1);
1328   const LLT S32 = LLT::scalar(32);
1329   const LLT S64 = LLT::scalar(64);
1330 
1331   Register Src = MI.getOperand(1).getReg();
1332   assert(MRI.getType(Src) == S64);
1333 
1334   // TODO: Should this use extract since the low half is unused?
1335   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1336   Register Hi = Unmerge.getReg(1);
1337 
1338   // Extract the upper half, since this is where we will find the sign and
1339   // exponent.
1340   auto Exp = extractF64Exponent(Hi, B);
1341 
1342   const unsigned FractBits = 52;
1343 
1344   // Extract the sign bit.
1345   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1346   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1347 
1348   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1349 
1350   const auto Zero32 = B.buildConstant(S32, 0);
1351 
1352   // Extend back to 64-bits.
1353   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1354 
1355   auto Shr = B.buildAShr(S64, FractMask, Exp);
1356   auto Not = B.buildNot(S64, Shr);
1357   auto Tmp0 = B.buildAnd(S64, Src, Not);
1358   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1359 
1360   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1361   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1362 
1363   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1364   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1365   return true;
1366 }
1367 
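// Lower s64 -> f64 G_SITOFP/G_UITOFP by converting each 32-bit half
// separately, scaling the high half by 2^32 with ldexp, and adding the
// results.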
1368 bool AMDGPULegalizerInfo::legalizeITOFP(
1369   MachineInstr &MI, MachineRegisterInfo &MRI,
1370   MachineIRBuilder &B, bool Signed) const {
1371   B.setInstr(MI);
1372 
1373   Register Dst = MI.getOperand(0).getReg();
1374   Register Src = MI.getOperand(1).getReg();
1375 
1376   const LLT S64 = LLT::scalar(64);
1377   const LLT S32 = LLT::scalar(32);
1378 
1379   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1380 
1381   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1382 
1383   auto CvtHi = Signed ?
1384     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1385     B.buildUITOFP(S64, Unmerge.getReg(1));
1386 
1387   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1388 
1389   auto ThirtyTwo = B.buildConstant(S32, 32);
1390   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1391     .addUse(CvtHi.getReg(0))
1392     .addUse(ThirtyTwo.getReg(0));
1393 
1394   // TODO: Should this propagate fast-math-flags?
1395   B.buildFAdd(Dst, LdExp, CvtLo);
1396   MI.eraseFromParent();
1397   return true;
1398 }
1399 
1400 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1401   MachineInstr &MI, MachineRegisterInfo &MRI,
1402   MachineIRBuilder &B) const {
1403   MachineFunction &MF = B.getMF();
1404   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1405 
1406   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1407                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1408 
1409   // With ieee_mode disabled, the instructions have the correct behavior
1410   // already for G_FMINNUM/G_FMAXNUM
1411   if (!MFI->getMode().IEEE)
1412     return !IsIEEEOp;
1413 
1414   if (IsIEEEOp)
1415     return true;
1416 
1417   MachineIRBuilder HelperBuilder(MI);
1418   GISelObserverWrapper DummyObserver;
1419   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1420   HelperBuilder.setInstr(MI);
1421   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1422 }
1423 
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

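// Same strategy as the extract case: a known constant index becomes an insert
// at a fixed bit offset, an out-of-bounds index produces undef, and dynamic
// indices are deferred to instruction selection.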
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

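// Lower G_FSIN/G_FCOS to the amdgcn.sin/amdgcn.cos intrinsics. The input is
// pre-scaled by 1/(2*pi) before the intrinsic call, and on subtargets with a
// reduced trig input range the scaled value is first wrapped into [0, 1)
// with amdgcn.fract.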
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

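// Legalize G_GLOBAL_VALUE for LDS (local/region address space) globals. A
// global without a defined initializer lowers to the constant offset assigned
// to it within the function's LDS allocation; initialized LDS globals and LDS
// globals referenced from non-kernel functions are diagnosed as unsupported.
// Other address spaces are not handled here.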
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    B.setInstr(MI);

    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }
  } else
    return false;

  const Function &Fn = MF.getFunction();
  DiagnosticInfoUnsupported BadInit(
    Fn, "unsupported initializer for address space", MI.getDebugLoc());
  Fn.getContext().diagnose(BadInit);
  return true;
}

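// Custom-legalize a load by casting its pointer operand to the 64-bit
// constant address space; the load instruction itself is updated in place.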
bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

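// G_FMAD is only treated as legal when denormals are disabled for the result
// type; otherwise it is expanded through LegalizerHelper::lowerFMad.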
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Return the G_BRCOND instruction that consumes the condition output, or null
// if the use is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
    UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

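// Return the virtual register already associated with the physical live-in
// \p Reg, or create a new generic virtual register of type \p Ty and record
// the live-in mapping for it.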
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

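// Copy a preloaded argument register into \p DstReg. Masked arguments are
// unpacked as (LiveIn >> countTrailingZeros(Mask)) & (Mask >> Shift). If the
// live-in virtual register has no def yet, a copy from the physical argument
// register is inserted at the start of the entry block.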
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

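// Replace an intrinsic reading a preloaded kernel or function argument with a
// copy from the corresponding argument register, if one was allocated.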
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

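// Expand amdgcn.fdiv.fast as Scale * (LHS * rcp(RHS * Scale)), where Scale is
// 2^-32 if |RHS| > 2^96 and 1.0 otherwise (0x6f800000 and 0x2f800000 are the
// f32 bit patterns for 2^96 and 2^-32). Pre-scaling very large denominators
// keeps the intermediate reciprocal representable.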
bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

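// For kernels, the implicit argument pointer is the kernarg segment pointer
// offset past the explicit kernel arguments. Non-entry functions instead use
// the preloaded implicit argument pointer register.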
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

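// amdgcn.is.shared / amdgcn.is.private test whether a flat pointer falls in
// the given address space by comparing the pointer's high 32 bits against
// that address space's aperture.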
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
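/// On subtargets with unpacked d16 memory instructions, each 16-bit component
/// must occupy a full 32-bit lane: the <N x s16> value is unmerged, each
/// element any-extended to s32, and the pieces rebuilt as an <N x s32> vector.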
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

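// 8-bit and 16-bit scalar store data is any-extended to 32 bits. Vectors of
// up to four 16-bit elements are accepted, with the data repacked through
// handleD16VData for format stores; beyond that, only s32 and vectors of up
// to four s32 elements are treated as legal.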
bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fix up illegal register types for 8-bit and 16-bit stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the consuming G_BRCOND with the exec-manipulating branch pseudos
  // (SI_IF / SI_LOOP) for the control flow intrinsics.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}
