1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
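// True if the type is no bigger than MaxSize bits and its scalar element size
// is a multiple of 32 bits (e.g. s32, s64, v2s32).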
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 512) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
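// True for vectors with an odd number of elements narrower than 32 bits
// (e.g. v3s16, v5s8).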
51 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
52   return [=](const LegalityQuery &Query) {
53     const LLT Ty = Query.Types[TypeIdx];
54     return Ty.isVector() &&
55            Ty.getNumElements() % 2 != 0 &&
56            Ty.getElementType().getSizeInBits() < 32;
57   };
58 }
59 
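// Pad the vector with one extra element, e.g. v3s16 -> v4s16.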
60 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getElementType();
64     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
65   };
66 }
67 
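// Split the vector so that each resulting piece fits in 64 bits,
// e.g. v4s32 -> v2s32.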
68 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
69   return [=](const LegalityQuery &Query) {
70     const LLT Ty = Query.Types[TypeIdx];
71     const LLT EltTy = Ty.getElementType();
72     unsigned Size = Ty.getSizeInBits();
73     unsigned Pieces = (Size + 63) / 64;
74     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
75     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
76   };
77 }
78 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
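// For example, v3s8 (24 bits) is widened to v4s8 (32 bits).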
81 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84 
85     const LLT EltTy = Ty.getElementType();
86     const int Size = Ty.getSizeInBits();
87     const int EltSize = EltTy.getSizeInBits();
88     const int NextMul32 = (Size + 31) / 32;
89 
90     assert(EltSize < 32);
91 
92     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
93     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
94   };
95 }
96 
97 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
98   return [=](const LegalityQuery &Query) {
99     const LLT QueryTy = Query.Types[TypeIdx];
100     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
101   };
102 }
103 
104 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
105   return [=](const LegalityQuery &Query) {
106     const LLT QueryTy = Query.Types[TypeIdx];
107     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
108   };
109 }
110 
111 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
112   return [=](const LegalityQuery &Query) {
113     const LLT QueryTy = Query.Types[TypeIdx];
114     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
115   };
116 }
117 
118 // Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
119 // v2s16.
120 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
121   return [=](const LegalityQuery &Query) {
122     const LLT Ty = Query.Types[TypeIdx];
123     if (Ty.isVector()) {
124       const int EltSize = Ty.getElementType().getSizeInBits();
125       return EltSize == 32 || EltSize == 64 ||
126             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
127              EltSize == 128 || EltSize == 256;
128     }
129 
130     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
131   };
132 }
133 
134 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
135   return [=](const LegalityQuery &Query) {
136     return Query.Types[TypeIdx].getElementType() == Type;
137   };
138 }
139 
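// True for scalar truncating stores where the value type is wider than 32
// bits and wider than the stored memory size.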
140 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
141   return [=](const LegalityQuery &Query) {
142     const LLT Ty = Query.Types[TypeIdx];
143     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
144            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
145   };
146 }
147 
148 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
149                                          const GCNTargetMachine &TM)
  : ST(ST_) {
151   using namespace TargetOpcode;
152 
153   auto GetAddrSpacePtr = [&TM](unsigned AS) {
154     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
155   };
156 
157   const LLT S1 = LLT::scalar(1);
158   const LLT S8 = LLT::scalar(8);
159   const LLT S16 = LLT::scalar(16);
160   const LLT S32 = LLT::scalar(32);
161   const LLT S64 = LLT::scalar(64);
162   const LLT S96 = LLT::scalar(96);
163   const LLT S128 = LLT::scalar(128);
164   const LLT S256 = LLT::scalar(256);
165   const LLT S512 = LLT::scalar(512);
166 
167   const LLT V2S16 = LLT::vector(2, 16);
168   const LLT V4S16 = LLT::vector(4, 16);
169 
170   const LLT V2S32 = LLT::vector(2, 32);
171   const LLT V3S32 = LLT::vector(3, 32);
172   const LLT V4S32 = LLT::vector(4, 32);
173   const LLT V5S32 = LLT::vector(5, 32);
174   const LLT V6S32 = LLT::vector(6, 32);
175   const LLT V7S32 = LLT::vector(7, 32);
176   const LLT V8S32 = LLT::vector(8, 32);
177   const LLT V9S32 = LLT::vector(9, 32);
178   const LLT V10S32 = LLT::vector(10, 32);
179   const LLT V11S32 = LLT::vector(11, 32);
180   const LLT V12S32 = LLT::vector(12, 32);
181   const LLT V13S32 = LLT::vector(13, 32);
182   const LLT V14S32 = LLT::vector(14, 32);
183   const LLT V15S32 = LLT::vector(15, 32);
184   const LLT V16S32 = LLT::vector(16, 32);
185 
186   const LLT V2S64 = LLT::vector(2, 64);
187   const LLT V3S64 = LLT::vector(3, 64);
188   const LLT V4S64 = LLT::vector(4, 64);
189   const LLT V5S64 = LLT::vector(5, 64);
190   const LLT V6S64 = LLT::vector(6, 64);
191   const LLT V7S64 = LLT::vector(7, 64);
192   const LLT V8S64 = LLT::vector(8, 64);
193 
194   std::initializer_list<LLT> AllS32Vectors =
195     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
196      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
197   std::initializer_list<LLT> AllS64Vectors =
198     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
199 
200   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
201   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
202   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
203   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
204   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
205   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
206   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
207 
208   const LLT CodePtr = FlatPtr;
209 
210   const std::initializer_list<LLT> AddrSpaces64 = {
211     GlobalPtr, ConstantPtr, FlatPtr
212   };
213 
214   const std::initializer_list<LLT> AddrSpaces32 = {
215     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
216   };
217 
218   const std::initializer_list<LLT> FPTypesBase = {
219     S32, S64
220   };
221 
222   const std::initializer_list<LLT> FPTypes16 = {
223     S32, S64, S16
224   };
225 
226   const std::initializer_list<LLT> FPTypesPK16 = {
227     S32, S64, S16, V2S16
228   };
229 
230   setAction({G_BRCOND, S1}, Legal);
231 
232   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
233   // elements for v3s16
234   getActionDefinitionsBuilder(G_PHI)
235     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
236     .legalFor(AllS32Vectors)
237     .legalFor(AllS64Vectors)
238     .legalFor(AddrSpaces64)
239     .legalFor(AddrSpaces32)
240     .clampScalar(0, S32, S256)
241     .widenScalarToNextPow2(0, 32)
242     .clampMaxNumElements(0, S32, 16)
243     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
244     .legalIf(isPointer(0));
245 
246   if (ST.has16BitInsts()) {
247     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
248       .legalFor({S32, S16})
249       .clampScalar(0, S16, S32)
250       .scalarize(0);
251   } else {
252     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
253       .legalFor({S32})
254       .clampScalar(0, S32, S32)
255       .scalarize(0);
256   }
257 
258   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
259     .legalFor({S32})
260     .clampScalar(0, S32, S32)
261     .scalarize(0);
262 
263   // Report legal for any types we can handle anywhere. For the cases only legal
264   // on the SALU, RegBankSelect will be able to re-legalize.
265   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
266     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
267     .clampScalar(0, S32, S64)
268     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
269     .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
270     .widenScalarToNextPow2(0)
271     .scalarize(0);
272 
273   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
274                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
275     .legalFor({{S32, S1}})
276     .clampScalar(0, S32, S32);
277 
278   getActionDefinitionsBuilder(G_BITCAST)
279     .legalForCartesianProduct({S32, V2S16})
280     .legalForCartesianProduct({S64, V2S32, V4S16})
281     .legalForCartesianProduct({V2S64, V4S32})
282     // Don't worry about the size constraint.
283     .legalIf(all(isPointer(0), isPointer(1)))
284     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});
286 
287   getActionDefinitionsBuilder(G_FCONSTANT)
288     .legalFor({S32, S64, S16})
289     .clampScalar(0, S16, S64);
290 
291   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
292     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
293                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
294     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
295     .clampScalarOrElt(0, S32, S512)
296     .legalIf(isMultiple32(0))
297     .widenScalarToNextPow2(0, 32)
298     .clampMaxNumElements(0, S32, 16);
299 
300 
301   // FIXME: i1 operands to intrinsics should always be legal, but other i1
302   // values may not be legal.  We need to figure out how to distinguish
303   // between these two scenarios.
304   getActionDefinitionsBuilder(G_CONSTANT)
305     .legalFor({S1, S32, S64, S16, GlobalPtr,
306                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
307     .clampScalar(0, S32, S64)
308     .widenScalarToNextPow2(0)
309     .legalIf(isPointer(0));
310 
311   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
312   getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});
313 
314 
315   auto &FPOpActions = getActionDefinitionsBuilder(
316     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
317     .legalFor({S32, S64});
318   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
319     .customFor({S32, S64});
320 
321   if (ST.has16BitInsts()) {
322     if (ST.hasVOP3PInsts())
323       FPOpActions.legalFor({S16, V2S16});
324     else
325       FPOpActions.legalFor({S16});
326 
327     TrigActions.customFor({S16});
328   }
329 
330   auto &MinNumMaxNum = getActionDefinitionsBuilder({
331       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
332 
333   if (ST.hasVOP3PInsts()) {
334     MinNumMaxNum.customFor(FPTypesPK16)
335       .clampMaxNumElements(0, S16, 2)
336       .clampScalar(0, S16, S64)
337       .scalarize(0);
338   } else if (ST.has16BitInsts()) {
339     MinNumMaxNum.customFor(FPTypes16)
340       .clampScalar(0, S16, S64)
341       .scalarize(0);
342   } else {
343     MinNumMaxNum.customFor(FPTypesBase)
344       .clampScalar(0, S32, S64)
345       .scalarize(0);
346   }
347 
348   if (ST.hasVOP3PInsts())
349     FPOpActions.clampMaxNumElements(0, S16, 2);
350 
351   FPOpActions
352     .scalarize(0)
353     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
354 
355   TrigActions
356     .scalarize(0)
357     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
358 
359   getActionDefinitionsBuilder({G_FNEG, G_FABS})
360     .legalFor(FPTypesPK16)
361     .clampMaxNumElements(0, S16, 2)
362     .scalarize(0)
363     .clampScalar(0, S16, S64);
364 
365   // TODO: Implement
366   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
367 
368   if (ST.has16BitInsts()) {
369     getActionDefinitionsBuilder(G_FSQRT)
370       .legalFor({S32, S64, S16})
371       .scalarize(0)
372       .clampScalar(0, S16, S64);
373   } else {
374     getActionDefinitionsBuilder(G_FSQRT)
375       .legalFor({S32, S64})
376       .scalarize(0)
377       .clampScalar(0, S32, S64);
378   }
379 
380   getActionDefinitionsBuilder(G_FPTRUNC)
381     .legalFor({{S32, S64}, {S16, S32}})
382     .scalarize(0);
383 
384   getActionDefinitionsBuilder(G_FPEXT)
385     .legalFor({{S64, S32}, {S32, S16}})
386     .lowerFor({{S64, S16}}) // FIXME: Implement
387     .scalarize(0);
388 
389   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
390   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
391 
392   getActionDefinitionsBuilder(G_FSUB)
393       // Use actual fsub instruction
394       .legalFor({S32})
395       // Must use fadd + fneg
396       .lowerFor({S64, S16, V2S16})
397       .scalarize(0)
398       .clampScalar(0, S32, S64);
399 
400   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
401     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
402                {S32, S1}, {S64, S1}, {S16, S1},
403                {S96, S32},
404                // FIXME: Hack
405                {S64, LLT::scalar(33)},
406                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
407     .scalarize(0);
408 
409   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
410     .legalFor({{S32, S32}, {S64, S32}})
411     .lowerFor({{S32, S64}})
412     .customFor({{S64, S64}})
413     .scalarize(0);
414 
415   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
416     .legalFor({{S32, S32}, {S32, S64}})
417     .scalarize(0);
418 
419   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
420     .legalFor({S32, S64})
421     .scalarize(0);
422 
423   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
424     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
425       .legalFor({S32, S64})
426       .clampScalar(0, S32, S64)
427       .scalarize(0);
428   } else {
429     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
430       .legalFor({S32})
431       .customFor({S64})
432       .clampScalar(0, S32, S64)
433       .scalarize(0);
434   }
435 
436   getActionDefinitionsBuilder(G_GEP)
437     .legalForCartesianProduct(AddrSpaces64, {S64})
438     .legalForCartesianProduct(AddrSpaces32, {S32})
439     .scalarize(0);
440 
441   getActionDefinitionsBuilder(G_PTR_MASK)
442     .scalarize(0)
443     .alwaysLegal();
444 
445   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
446 
447   auto &CmpBuilder =
448     getActionDefinitionsBuilder(G_ICMP)
449     .legalForCartesianProduct(
450       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
451     .legalFor({{S1, S32}, {S1, S64}});
452   if (ST.has16BitInsts()) {
453     CmpBuilder.legalFor({{S1, S16}});
454   }
455 
456   CmpBuilder
457     .widenScalarToNextPow2(1)
458     .clampScalar(1, S32, S64)
459     .scalarize(0)
460     .legalIf(all(typeIs(0, S1), isPointer(1)));
461 
462   getActionDefinitionsBuilder(G_FCMP)
463     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
464     .widenScalarToNextPow2(1)
465     .clampScalar(1, S32, S64)
466     .scalarize(0);
467 
468   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
469   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
470                                G_FLOG, G_FLOG2, G_FLOG10})
471     .legalFor({S32})
472     .scalarize(0);
473 
474   // The 64-bit versions produce 32-bit results, but only on the SALU.
475   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
476                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
477                                G_CTPOP})
478     .legalFor({{S32, S32}, {S32, S64}})
479     .clampScalar(0, S32, S32)
480     .clampScalar(1, S32, S64)
481     .scalarize(0)
482     .widenScalarToNextPow2(0, 32)
483     .widenScalarToNextPow2(1, 32);
484 
485   // TODO: Expand for > s32
486   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
487     .legalFor({S32})
488     .clampScalar(0, S32, S32)
489     .scalarize(0);
490 
491   if (ST.has16BitInsts()) {
492     if (ST.hasVOP3PInsts()) {
493       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
494         .legalFor({S32, S16, V2S16})
495         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496         .clampMaxNumElements(0, S16, 2)
497         .clampScalar(0, S16, S32)
498         .widenScalarToNextPow2(0)
499         .scalarize(0);
500     } else {
501       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
502         .legalFor({S32, S16})
503         .widenScalarToNextPow2(0)
504         .clampScalar(0, S16, S32)
505         .scalarize(0);
506     }
507   } else {
508     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
509       .legalFor({S32})
510       .clampScalar(0, S32, S32)
511       .widenScalarToNextPow2(0)
512       .scalarize(0);
513   }
514 
515   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
516     return [=](const LegalityQuery &Query) {
517       return Query.Types[TypeIdx0].getSizeInBits() <
518              Query.Types[TypeIdx1].getSizeInBits();
519     };
520   };
521 
522   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
523     return [=](const LegalityQuery &Query) {
524       return Query.Types[TypeIdx0].getSizeInBits() >
525              Query.Types[TypeIdx1].getSizeInBits();
526     };
527   };
528 
529   getActionDefinitionsBuilder(G_INTTOPTR)
530     // List the common cases
531     .legalForCartesianProduct(AddrSpaces64, {S64})
532     .legalForCartesianProduct(AddrSpaces32, {S32})
533     .scalarize(0)
534     // Accept any address space as long as the size matches
535     .legalIf(sameSize(0, 1))
536     .widenScalarIf(smallerThan(1, 0),
537       [](const LegalityQuery &Query) {
538         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
539       })
540     .narrowScalarIf(greaterThan(1, 0),
541       [](const LegalityQuery &Query) {
542         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
543       });
544 
545   getActionDefinitionsBuilder(G_PTRTOINT)
546     // List the common cases
547     .legalForCartesianProduct(AddrSpaces64, {S64})
548     .legalForCartesianProduct(AddrSpaces32, {S32})
549     .scalarize(0)
550     // Accept any address space as long as the size matches
551     .legalIf(sameSize(0, 1))
552     .widenScalarIf(smallerThan(0, 1),
553       [](const LegalityQuery &Query) {
554         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
555       })
556     .narrowScalarIf(
557       greaterThan(0, 1),
558       [](const LegalityQuery &Query) {
559         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
560       });
561 
562   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
563     .scalarize(0)
564     .custom();
565 
566   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
567   // handle some operations by just promoting the register during
568   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
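
  // Maximum size in bits of a single load/store that is treated as legal for
  // the given address space.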
569   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
570     switch (AS) {
571     // FIXME: Private element size.
572     case AMDGPUAS::PRIVATE_ADDRESS:
573       return 32;
574     // FIXME: Check subtarget
575     case AMDGPUAS::LOCAL_ADDRESS:
576       return ST.useDS128() ? 128 : 64;
577 
578     // Treat constant and global as identical. SMRD loads are sometimes usable
579     // for global loads (ideally constant address space should be eliminated)
580     // depending on the context. Legality cannot be context dependent, but
581     // RegBankSelect can split the load as necessary depending on the pointer
582     // register bank/uniformity and if the memory is invariant or not written in
583     // a kernel.
584     case AMDGPUAS::CONSTANT_ADDRESS:
585     case AMDGPUAS::GLOBAL_ADDRESS:
586       return 512;
587     default:
588       return 128;
589     }
590   };
591 
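  // Return true if the load/store must be broken up: vector extloads, accesses
  // wider than the address space allows, 96-bit accesses without dwordx3
  // support, and accesses too misaligned for the target to handle.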
592   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
593     const LLT DstTy = Query.Types[0];
594 
595     // Split vector extloads.
596     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
597     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
598       return true;
599 
600     const LLT PtrTy = Query.Types[1];
601     unsigned AS = PtrTy.getAddressSpace();
602     if (MemSize > maxSizeForAddrSpace(AS))
603       return true;
604 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
606     // TODO: May be able to widen depending on alignment etc.
607     unsigned NumRegs = MemSize / 32;
608     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
609       return true;
610 
611     unsigned Align = Query.MMODescrs[0].AlignInBits;
612     if (Align < MemSize) {
613       const SITargetLowering *TLI = ST.getTargetLowering();
614       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
615     }
616 
617     return false;
618   };
619 
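  // Minimum alignment (in bits) required for global and flat accesses of each
  // width below; relaxed to 0 when unaligned buffer access is supported.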
620   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
621   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
622   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
623 
624   // TODO: Refine based on subtargets which support unaligned access or 128-bit
625   // LDS
626   // TODO: Unsupported flat for SI.
627 
628   for (unsigned Op : {G_LOAD, G_STORE}) {
629     const bool IsStore = Op == G_STORE;
630 
631     auto &Actions = getActionDefinitionsBuilder(Op);
632     // Whitelist the common cases.
633     // TODO: Pointer loads
634     // TODO: Wide constant loads
635     // TODO: Only CI+ has 3x loads
636     // TODO: Loads to s16 on gfx9
637     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
638                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
639                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
640                                       {S96, GlobalPtr, 96, GlobalAlign32},
641                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
642                                       {S128, GlobalPtr, 128, GlobalAlign32},
643                                       {S64, GlobalPtr, 64, GlobalAlign32},
644                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
645                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
646                                       {S32, GlobalPtr, 8, GlobalAlign8},
647                                       {S32, GlobalPtr, 16, GlobalAlign16},
648 
649                                       {S32, LocalPtr, 32, 32},
650                                       {S64, LocalPtr, 64, 32},
651                                       {V2S32, LocalPtr, 64, 32},
652                                       {S32, LocalPtr, 8, 8},
653                                       {S32, LocalPtr, 16, 16},
654                                       {V2S16, LocalPtr, 32, 32},
655 
656                                       {S32, PrivatePtr, 32, 32},
657                                       {S32, PrivatePtr, 8, 8},
658                                       {S32, PrivatePtr, 16, 16},
659                                       {V2S16, PrivatePtr, 32, 32},
660 
661                                       {S32, FlatPtr, 32, GlobalAlign32},
662                                       {S32, FlatPtr, 16, GlobalAlign16},
663                                       {S32, FlatPtr, 8, GlobalAlign8},
664                                       {V2S16, FlatPtr, 32, GlobalAlign32},
665 
666                                       {S32, ConstantPtr, 32, GlobalAlign32},
667                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
668                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
669                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
670                                       {S64, ConstantPtr, 64, GlobalAlign32},
671                                       {S128, ConstantPtr, 128, GlobalAlign32},
672                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
673     Actions
674         .customIf(typeIs(1, Constant32Ptr))
675         .narrowScalarIf(
676             [=](const LegalityQuery &Query) -> bool {
677               return !Query.Types[0].isVector() && needToSplitLoad(Query);
678             },
679             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
680               const LLT DstTy = Query.Types[0];
681               const LLT PtrTy = Query.Types[1];
682 
683               const unsigned DstSize = DstTy.getSizeInBits();
684               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
685 
686               // Split extloads.
687               if (DstSize > MemSize)
688                 return std::make_pair(0, LLT::scalar(MemSize));
689 
690               if (DstSize > 32 && (DstSize % 32 != 0)) {
691                 // FIXME: Need a way to specify non-extload of larger size if
692                 // suitably aligned.
693                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
694               }
695 
696               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
697               if (MemSize > MaxSize)
698                 return std::make_pair(0, LLT::scalar(MaxSize));
699 
700               unsigned Align = Query.MMODescrs[0].AlignInBits;
701               return std::make_pair(0, LLT::scalar(Align));
702             })
703         .fewerElementsIf(
704             [=](const LegalityQuery &Query) -> bool {
705               return Query.Types[0].isVector() && needToSplitLoad(Query);
706             },
707             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
708               const LLT DstTy = Query.Types[0];
709               const LLT PtrTy = Query.Types[1];
710 
711               LLT EltTy = DstTy.getElementType();
712               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
713 
714               // Split if it's too large for the address space.
715               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
716                 unsigned NumElts = DstTy.getNumElements();
717                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
718 
719                 // FIXME: Refine when odd breakdowns handled
720                 // The scalars will need to be re-legalized.
721                 if (NumPieces == 1 || NumPieces >= NumElts ||
722                     NumElts % NumPieces != 0)
723                   return std::make_pair(0, EltTy);
724 
725                 return std::make_pair(0,
726                                       LLT::vector(NumElts / NumPieces, EltTy));
727               }
728 
729               // Need to split because of alignment.
730               unsigned Align = Query.MMODescrs[0].AlignInBits;
731               unsigned EltSize = EltTy.getSizeInBits();
732               if (EltSize > Align &&
733                   (EltSize / Align < DstTy.getNumElements())) {
734                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
735               }
736 
737               // May need relegalization for the scalars.
738               return std::make_pair(0, EltTy);
739             })
740         .minScalar(0, S32);
741 
742     if (IsStore)
743       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
744 
745     // TODO: Need a bitcast lower option?
746     Actions
747         .legalIf([=](const LegalityQuery &Query) {
748           const LLT Ty0 = Query.Types[0];
749           unsigned Size = Ty0.getSizeInBits();
750           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
751           unsigned Align = Query.MMODescrs[0].AlignInBits;
752 
753           // No extending vector loads.
754           if (Size > MemSize && Ty0.isVector())
755             return false;
756 
757           // FIXME: Widening store from alignment not valid.
758           if (MemSize < Size)
759             MemSize = std::max(MemSize, Align);
760 
761           switch (MemSize) {
762           case 8:
763           case 16:
764             return Size == 32;
765           case 32:
766           case 64:
767           case 128:
768             return true;
769           case 96:
770             return ST.hasDwordx3LoadStores();
771           case 256:
772           case 512:
773             return true;
774           default:
775             return false;
776           }
777         })
778         .widenScalarToNextPow2(0)
779         // TODO: v3s32->v4s32 with alignment
780         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
781   }
782 
783   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
784                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
785                                                   {S32, GlobalPtr, 16, 2 * 8},
786                                                   {S32, LocalPtr, 8, 8},
787                                                   {S32, LocalPtr, 16, 16},
788                                                   {S32, PrivatePtr, 8, 8},
789                                                   {S32, PrivatePtr, 16, 16},
790                                                   {S32, ConstantPtr, 8, 8},
791                                                   {S32, ConstantPtr, 16, 2 * 8}});
792   if (ST.hasFlatAddressSpace()) {
793     ExtLoads.legalForTypesWithMemDesc(
794         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
795   }
796 
797   ExtLoads.clampScalar(0, S32, S32)
798           .widenScalarToNextPow2(0)
799           .unsupportedIfMemSizeNotPow2()
800           .lower();
801 
802   auto &Atomics = getActionDefinitionsBuilder(
803     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
804      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
805      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
806      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
807     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
808                {S64, GlobalPtr}, {S64, LocalPtr}});
809   if (ST.hasFlatAddressSpace()) {
810     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
811   }
812 
813   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
814     .legalFor({{S32, LocalPtr}});
815 
816   // TODO: Pointer types, any 32-bit or 64-bit vector
817   getActionDefinitionsBuilder(G_SELECT)
818     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
819           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
820           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
821     .clampScalar(0, S16, S64)
822     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
823     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
824     .scalarize(1)
825     .clampMaxNumElements(0, S32, 2)
826     .clampMaxNumElements(0, LocalPtr, 2)
827     .clampMaxNumElements(0, PrivatePtr, 2)
828     .scalarize(0)
829     .widenScalarToNextPow2(0)
830     .legalIf(all(isPointer(0), typeIs(1, S1)));
831 
832   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
833   // be more flexible with the shift amount type.
834   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
835     .legalFor({{S32, S32}, {S64, S32}});
836   if (ST.has16BitInsts()) {
837     if (ST.hasVOP3PInsts()) {
838       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
839             .clampMaxNumElements(0, S16, 2);
840     } else
841       Shifts.legalFor({{S16, S32}, {S16, S16}});
842 
843     Shifts.clampScalar(1, S16, S32);
844     Shifts.clampScalar(0, S16, S64);
845     Shifts.widenScalarToNextPow2(0, 16);
846   } else {
847     // Make sure we legalize the shift amount type first, as the general
848     // expansion for the shifted type will produce much worse code if it hasn't
849     // been truncated already.
850     Shifts.clampScalar(1, S32, S32);
851     Shifts.clampScalar(0, S32, S64);
852     Shifts.widenScalarToNextPow2(0, 32);
853   }
854   Shifts.scalarize(0);
855 
856   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
857     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
858     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
859     unsigned IdxTypeIdx = 2;
860 
861     getActionDefinitionsBuilder(Op)
862       .customIf([=](const LegalityQuery &Query) {
863           const LLT EltTy = Query.Types[EltTypeIdx];
864           const LLT VecTy = Query.Types[VecTypeIdx];
865           const LLT IdxTy = Query.Types[IdxTypeIdx];
866           return (EltTy.getSizeInBits() == 16 ||
867                   EltTy.getSizeInBits() % 32 == 0) &&
868                  VecTy.getSizeInBits() % 32 == 0 &&
869                  VecTy.getSizeInBits() <= 512 &&
870                  IdxTy.getSizeInBits() == 32;
871         })
872       .clampScalar(EltTypeIdx, S32, S64)
873       .clampScalar(VecTypeIdx, S32, S64)
874       .clampScalar(IdxTypeIdx, S32, S32);
875   }
876 
877   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
878     .unsupportedIf([=](const LegalityQuery &Query) {
879         const LLT &EltTy = Query.Types[1].getElementType();
880         return Query.Types[0] != EltTy;
881       });
882 
883   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
884     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
885     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
886 
887     // FIXME: Doesn't handle extract of illegal sizes.
888     getActionDefinitionsBuilder(Op)
889       .legalIf([=](const LegalityQuery &Query) {
890           const LLT BigTy = Query.Types[BigTyIdx];
891           const LLT LitTy = Query.Types[LitTyIdx];
892           return (BigTy.getSizeInBits() % 32 == 0) &&
893                  (LitTy.getSizeInBits() % 16 == 0);
894         })
895       .widenScalarIf(
896         [=](const LegalityQuery &Query) {
897           const LLT BigTy = Query.Types[BigTyIdx];
898           return (BigTy.getScalarSizeInBits() < 16);
899         },
900         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
901       .widenScalarIf(
902         [=](const LegalityQuery &Query) {
903           const LLT LitTy = Query.Types[LitTyIdx];
904           return (LitTy.getScalarSizeInBits() < 16);
905         },
906         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
907       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
908       .widenScalarToNextPow2(BigTyIdx, 32);
909 
910   }
911 
912   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
913     .legalForCartesianProduct(AllS32Vectors, {S32})
914     .legalForCartesianProduct(AllS64Vectors, {S64})
915     .clampNumElements(0, V16S32, V16S32)
916     .clampNumElements(0, V2S64, V8S64);
917 
918   if (ST.hasScalarPackInsts())
919     BuildVector.legalFor({V2S16, S32});
920 
921   BuildVector
922     .minScalarSameAs(1, 0)
923     .legalIf(isRegisterType(0))
924     .minScalarOrElt(0, S32);
925 
926   if (ST.hasScalarPackInsts()) {
927     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
928       .legalFor({V2S16, S32})
929       .lower();
930   } else {
931     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
932       .lower();
933   }
934 
935   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
936     .legalIf(isRegisterType(0));
937 
938   // TODO: Don't fully scalarize v2s16 pieces
939   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
940 
941   // Merge/Unmerge
942   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
943     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
944     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
945 
946     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
947       const LLT &Ty = Query.Types[TypeIdx];
948       if (Ty.isVector()) {
949         const LLT &EltTy = Ty.getElementType();
950         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
951           return true;
952         if (!isPowerOf2_32(EltTy.getSizeInBits()))
953           return true;
954       }
955       return false;
956     };
957 
958     getActionDefinitionsBuilder(Op)
959       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
961       // worth considering the multiples of 64 since 2*192 and 2*384 are not
962       // valid.
963       .clampScalar(LitTyIdx, S16, S256)
964       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
965       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
966       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
967                            elementTypeIs(1, S16)),
968                        changeTo(1, V2S16))
969       // Break up vectors with weird elements into scalars
970       .fewerElementsIf(
971         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
972         scalarize(0))
973       .fewerElementsIf(
974         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
975         scalarize(1))
976       .clampScalar(BigTyIdx, S32, S512)
977       .lowerFor({{S16, V2S16}})
978       .widenScalarIf(
979         [=](const LegalityQuery &Query) {
980           const LLT &Ty = Query.Types[BigTyIdx];
981           return !isPowerOf2_32(Ty.getSizeInBits()) &&
982                  Ty.getSizeInBits() % 16 != 0;
983         },
984         [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 once past 128 bits,
          // whichever is smaller.
987           const LLT &Ty = Query.Types[BigTyIdx];
988           unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
989           if (NewSizeInBits >= 256) {
990             unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
991             if (RoundedTo < NewSizeInBits)
992               NewSizeInBits = RoundedTo;
993           }
994           return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
995         })
996       .legalIf([=](const LegalityQuery &Query) {
997           const LLT &BigTy = Query.Types[BigTyIdx];
998           const LLT &LitTy = Query.Types[LitTyIdx];
999 
1000           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1001             return false;
1002           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1003             return false;
1004 
1005           return BigTy.getSizeInBits() % 16 == 0 &&
1006                  LitTy.getSizeInBits() % 16 == 0 &&
1007                  BigTy.getSizeInBits() <= 512;
1008         })
1009       // Any vectors left are the wrong size. Scalarize them.
1010       .scalarize(0)
1011       .scalarize(1);
1012   }
1013 
1014   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1015 
1016   computeTables();
1017   verify(*ST.getInstrInfo());
1018 }
1019 
1020 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1021                                          MachineRegisterInfo &MRI,
1022                                          MachineIRBuilder &B,
1023                                          GISelChangeObserver &Observer) const {
1024   switch (MI.getOpcode()) {
1025   case TargetOpcode::G_ADDRSPACE_CAST:
1026     return legalizeAddrSpaceCast(MI, MRI, B);
1027   case TargetOpcode::G_FRINT:
1028     return legalizeFrint(MI, MRI, B);
1029   case TargetOpcode::G_FCEIL:
1030     return legalizeFceil(MI, MRI, B);
1031   case TargetOpcode::G_INTRINSIC_TRUNC:
1032     return legalizeIntrinsicTrunc(MI, MRI, B);
1033   case TargetOpcode::G_SITOFP:
1034     return legalizeITOFP(MI, MRI, B, true);
1035   case TargetOpcode::G_UITOFP:
1036     return legalizeITOFP(MI, MRI, B, false);
1037   case TargetOpcode::G_FMINNUM:
1038   case TargetOpcode::G_FMAXNUM:
1039   case TargetOpcode::G_FMINNUM_IEEE:
1040   case TargetOpcode::G_FMAXNUM_IEEE:
1041     return legalizeMinNumMaxNum(MI, MRI, B);
1042   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1043     return legalizeExtractVectorElt(MI, MRI, B);
1044   case TargetOpcode::G_INSERT_VECTOR_ELT:
1045     return legalizeInsertVectorElt(MI, MRI, B);
1046   case TargetOpcode::G_FSIN:
1047   case TargetOpcode::G_FCOS:
1048     return legalizeSinCos(MI, MRI, B);
1049   case TargetOpcode::G_GLOBAL_VALUE:
1050     return legalizeGlobalValue(MI, MRI, B);
1051   case TargetOpcode::G_LOAD:
1052     return legalizeLoad(MI, MRI, B, Observer);
1053   default:
1054     return false;
1055   }
1056 
1057   llvm_unreachable("expected switch to return");
1058 }
1059 
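// Return a 32-bit value containing the segment aperture (the high half of a
// flat address) for the given address space, read from the aperture registers
// when available and otherwise loaded from the queue pointer.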
1060 Register AMDGPULegalizerInfo::getSegmentAperture(
1061   unsigned AS,
1062   MachineRegisterInfo &MRI,
1063   MachineIRBuilder &B) const {
1064   MachineFunction &MF = B.getMF();
1065   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1066   const LLT S32 = LLT::scalar(32);
1067 
1068   if (ST.hasApertureRegs()) {
1069     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1070     // getreg.
1071     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1072         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1073         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1074     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1075         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1076         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1077     unsigned Encoding =
1078         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1079         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1080         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1081 
1082     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1083     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1084 
1085     B.buildInstr(AMDGPU::S_GETREG_B32)
1086       .addDef(GetReg)
1087       .addImm(Encoding);
1088     MRI.setType(GetReg, S32);
1089 
1090     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1091     B.buildInstr(TargetOpcode::G_SHL)
1092       .addDef(ApertureReg)
1093       .addUse(GetReg)
1094       .addUse(ShiftAmt.getReg(0));
1095 
1096     return ApertureReg;
1097   }
1098 
1099   Register QueuePtr = MRI.createGenericVirtualRegister(
1100     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1101 
1102   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1103   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1104     return Register();
1105 
1106   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1107   // private_segment_aperture_base_hi.
1108   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1109 
1110   // FIXME: Don't use undef
1111   Value *V = UndefValue::get(PointerType::get(
1112                                Type::getInt8Ty(MF.getFunction().getContext()),
1113                                AMDGPUAS::CONSTANT_ADDRESS));
1114 
1115   MachinePointerInfo PtrInfo(V, StructOffset);
1116   MachineMemOperand *MMO = MF.getMachineMemOperand(
1117     PtrInfo,
1118     MachineMemOperand::MOLoad |
1119     MachineMemOperand::MODereferenceable |
1120     MachineMemOperand::MOInvariant,
1121     4,
1122     MinAlign(64, StructOffset));
1123 
1124   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1125   Register LoadAddr;
1126 
1127   B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1128   B.buildLoad(LoadResult, LoadAddr, *MMO);
1129   return LoadResult;
1130 }
1131 
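// Custom lowering for G_ADDRSPACE_CAST. No-op casts become bitcasts, casts
// involving the 32-bit constant address space extract or merge the low half,
// and casts between flat and local/private are lowered with a null check plus
// an extract or merge of the 32-bit offset and the segment aperture.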
1132 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1133   MachineInstr &MI, MachineRegisterInfo &MRI,
1134   MachineIRBuilder &B) const {
1135   MachineFunction &MF = B.getMF();
1136 
1137   B.setInstr(MI);
1138 
1139   const LLT S32 = LLT::scalar(32);
1140   Register Dst = MI.getOperand(0).getReg();
1141   Register Src = MI.getOperand(1).getReg();
1142 
1143   LLT DstTy = MRI.getType(Dst);
1144   LLT SrcTy = MRI.getType(Src);
1145   unsigned DestAS = DstTy.getAddressSpace();
1146   unsigned SrcAS = SrcTy.getAddressSpace();
1147 
1148   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1149   // vector element.
1150   assert(!DstTy.isVector());
1151 
1152   const AMDGPUTargetMachine &TM
1153     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1154 
1155   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1156   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1157     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1158     return true;
1159   }
1160 
1161   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1162     // Truncate.
1163     B.buildExtract(Dst, Src, 0);
1164     MI.eraseFromParent();
1165     return true;
1166   }
1167 
1168   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1169     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1170     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1171 
    // FIXME: This is a bit ugly: we merge two 32-bit pointers into a pointer
    // of another type. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1175     auto HighAddr = B.buildConstant(
1176       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1177     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1178     MI.eraseFromParent();
1179     return true;
1180   }
1181 
1182   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1183     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1184            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1185     unsigned NullVal = TM.getNullPointerValue(DestAS);
1186 
1187     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1188     auto FlatNull = B.buildConstant(SrcTy, 0);
1189 
1190     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1191 
1192     // Extract low 32-bits of the pointer.
1193     B.buildExtract(PtrLo32, Src, 0);
1194 
1195     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1196     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1197     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1198 
1199     MI.eraseFromParent();
1200     return true;
1201   }
1202 
1203   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1204     return false;
1205 
1206   if (!ST.hasFlatAddressSpace())
1207     return false;
1208 
1209   auto SegmentNull =
1210       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1211   auto FlatNull =
1212       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1213 
1214   Register ApertureReg = getSegmentAperture(DestAS, MRI, B);
1215   if (!ApertureReg.isValid())
1216     return false;
1217 
1218   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1219   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1220 
1221   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1222 
1223   // Coerce the type of the low half of the result so we can use merge_values.
1224   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1225   B.buildInstr(TargetOpcode::G_PTRTOINT)
1226     .addDef(SrcAsInt)
1227     .addUse(Src);
1228 
1229   // TODO: Should we allow mismatched types but matching sizes in merges to
1230   // avoid the ptrtoint?
1231   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1232   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1233 
1234   MI.eraseFromParent();
1235   return true;
1236 }
1237 
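// Lower f64 G_FRINT by adding and subtracting 2^52 with the sign of the
// source; values whose magnitude exceeds 0x1.fffffffffffffp+51 are already
// integers and are returned unchanged.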
1238 bool AMDGPULegalizerInfo::legalizeFrint(
1239   MachineInstr &MI, MachineRegisterInfo &MRI,
1240   MachineIRBuilder &B) const {
1241   B.setInstr(MI);
1242 
1243   Register Src = MI.getOperand(1).getReg();
1244   LLT Ty = MRI.getType(Src);
1245   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1246 
1247   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1248   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1249 
1250   auto C1 = B.buildFConstant(Ty, C1Val);
1251   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1252 
1253   // TODO: Should this propagate fast-math-flags?
1254   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1255   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1256 
1257   auto C2 = B.buildFConstant(Ty, C2Val);
1258   auto Fabs = B.buildFAbs(Ty, Src);
1259 
1260   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1261   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1262   return true;
1263 }
1264 
1265 bool AMDGPULegalizerInfo::legalizeFceil(
1266   MachineInstr &MI, MachineRegisterInfo &MRI,
1267   MachineIRBuilder &B) const {
1268   B.setInstr(MI);
1269 
1270   const LLT S1 = LLT::scalar(1);
1271   const LLT S64 = LLT::scalar(64);
1272 
1273   Register Src = MI.getOperand(1).getReg();
1274   assert(MRI.getType(Src) == S64);
1275 
1276   // result = trunc(src)
1277   // if (src > 0.0 && src != result)
1278   //   result += 1.0
1279 
1280   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1281 
1282   const auto Zero = B.buildFConstant(S64, 0.0);
1283   const auto One = B.buildFConstant(S64, 1.0);
1284   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1285   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1286   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1287   auto Add = B.buildSelect(S64, And, One, Zero);
1288 
1289   // TODO: Should this propagate fast-math-flags?
1290   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1291   return true;
1292 }
1293 
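// Extract the unbiased exponent from the high 32 bits of an f64 value.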
1294 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1295                                               MachineIRBuilder &B) {
1296   const unsigned FractBits = 52;
1297   const unsigned ExpBits = 11;
1298   LLT S32 = LLT::scalar(32);
1299 
1300   auto Const0 = B.buildConstant(S32, FractBits - 32);
1301   auto Const1 = B.buildConstant(S32, ExpBits);
1302 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1306 
1307   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1308 }
1309 
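// Lower f64 G_INTRINSIC_TRUNC by masking off the fraction bits below the
// exponent. Exponents below zero produce a signed zero; exponents above 51
// mean the value is already an integer and is returned unchanged.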
1310 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1311   MachineInstr &MI, MachineRegisterInfo &MRI,
1312   MachineIRBuilder &B) const {
1313   B.setInstr(MI);
1314 
1315   const LLT S1 = LLT::scalar(1);
1316   const LLT S32 = LLT::scalar(32);
1317   const LLT S64 = LLT::scalar(64);
1318 
1319   Register Src = MI.getOperand(1).getReg();
1320   assert(MRI.getType(Src) == S64);
1321 
1322   // TODO: Should this use extract since the low half is unused?
1323   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1324   Register Hi = Unmerge.getReg(1);
1325 
1326   // Extract the upper half, since this is where we will find the sign and
1327   // exponent.
1328   auto Exp = extractF64Exponent(Hi, B);
1329 
1330   const unsigned FractBits = 52;
1331 
1332   // Extract the sign bit.
1333   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1334   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1335 
1336   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1337 
1338   const auto Zero32 = B.buildConstant(S32, 0);
1339 
1340   // Extend back to 64-bits.
1341   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1342 
1343   auto Shr = B.buildAShr(S64, FractMask, Exp);
1344   auto Not = B.buildNot(S64, Shr);
1345   auto Tmp0 = B.buildAnd(S64, Src, Not);
1346   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1347 
1348   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1349   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1350 
1351   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1352   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1353   return true;
1354 }
1355 
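// Lower s64 -> f64 G_SITOFP/G_UITOFP by converting the two 32-bit halves
// separately, scaling the converted high half by 2^32 with ldexp, and adding
// the two results.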
1356 bool AMDGPULegalizerInfo::legalizeITOFP(
1357   MachineInstr &MI, MachineRegisterInfo &MRI,
1358   MachineIRBuilder &B, bool Signed) const {
1359   B.setInstr(MI);
1360 
1361   Register Dst = MI.getOperand(0).getReg();
1362   Register Src = MI.getOperand(1).getReg();
1363 
1364   const LLT S64 = LLT::scalar(64);
1365   const LLT S32 = LLT::scalar(32);
1366 
1367   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1368 
1369   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1370 
1371   auto CvtHi = Signed ?
1372     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1373     B.buildUITOFP(S64, Unmerge.getReg(1));
1374 
1375   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1376 
1377   auto ThirtyTwo = B.buildConstant(S32, 32);
1378   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1379     .addUse(CvtHi.getReg(0))
1380     .addUse(ThirtyTwo.getReg(0));
1381 
1382   // TODO: Should this propagate fast-math-flags?
1383   B.buildFAdd(Dst, LdExp, CvtLo);
1384   MI.eraseFromParent();
1385   return true;
1386 }
1387 
1388 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1389   MachineInstr &MI, MachineRegisterInfo &MRI,
1390   MachineIRBuilder &B) const {
1391   MachineFunction &MF = B.getMF();
1392   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1393 
1394   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1395                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1396 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1399   if (!MFI->getMode().IEEE)
1400     return !IsIEEEOp;
1401 
1402   if (IsIEEEOp)
1403     return true;
1404 
1405   MachineIRBuilder HelperBuilder(MI);
1406   GISelObserverWrapper DummyObserver;
1407   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1408   HelperBuilder.setInstr(MI);
1409   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1410 }
1411 
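// Lower G_EXTRACT_VECTOR_ELT with a constant index to a G_EXTRACT at a known
// bit offset, or to an undef if the index is out of range. Dynamic indices
// are left to be selected as register indexing.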
1412 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1413   MachineInstr &MI, MachineRegisterInfo &MRI,
1414   MachineIRBuilder &B) const {
1415   // TODO: Should move some of this into LegalizerHelper.
1416 
1417   // TODO: Promote dynamic indexing of s16 to s32
1418   // TODO: Dynamic s64 indexing is only legal for SGPR.
1419   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1420   if (!IdxVal) // Dynamic case will be selected to register indexing.
1421     return true;
1422 
1423   Register Dst = MI.getOperand(0).getReg();
1424   Register Vec = MI.getOperand(1).getReg();
1425 
1426   LLT VecTy = MRI.getType(Vec);
1427   LLT EltTy = VecTy.getElementType();
1428   assert(EltTy == MRI.getType(Dst));
1429 
1430   B.setInstr(MI);
1431 
1432   if (IdxVal.getValue() < VecTy.getNumElements())
1433     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1434   else
1435     B.buildUndef(Dst);
1436 
1437   MI.eraseFromParent();
1438   return true;
1439 }
1440 
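// Lower G_INSERT_VECTOR_ELT with a constant index to a G_INSERT at a known
// bit offset, or to an undef result if the index is out of range. Dynamic
// indices are left to be selected as register indexing.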
1441 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1442   MachineInstr &MI, MachineRegisterInfo &MRI,
1443   MachineIRBuilder &B) const {
1444   // TODO: Should move some of this into LegalizerHelper.
1445 
1446   // TODO: Promote dynamic indexing of s16 to s32
1447   // TODO: Dynamic s64 indexing is only legal for SGPR.
1448   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1449   if (!IdxVal) // Dynamic case will be selected to register indexing.
1450     return true;
1451 
1452   Register Dst = MI.getOperand(0).getReg();
1453   Register Vec = MI.getOperand(1).getReg();
1454   Register Ins = MI.getOperand(2).getReg();
1455 
1456   LLT VecTy = MRI.getType(Vec);
1457   LLT EltTy = VecTy.getElementType();
1458   assert(EltTy == MRI.getType(Ins));
1459 
1460   B.setInstr(MI);
1461 
1462   if (IdxVal.getValue() < VecTy.getNumElements())
1463     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1464   else
1465     B.buildUndef(Dst);
1466 
1467   MI.eraseFromParent();
1468   return true;
1469 }
1470 
1471 bool AMDGPULegalizerInfo::legalizeSinCos(
1472   MachineInstr &MI, MachineRegisterInfo &MRI,
1473   MachineIRBuilder &B) const {
1474   B.setInstr(MI);
1475 
1476   Register DstReg = MI.getOperand(0).getReg();
1477   Register SrcReg = MI.getOperand(1).getReg();
1478   LLT Ty = MRI.getType(DstReg);
1479   unsigned Flags = MI.getFlags();
1480 
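  // The hardware sin/cos operate on revolutions rather than radians, so
  // pre-multiply the input by 1/(2*pi). Subtargets with a reduced valid
  // input range additionally need the fractional part taken first.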
  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    B.setInstr(MI);

    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

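    // A declaration-only LDS global lowers to the constant byte offset
    // assigned to it within the kernel's group segment.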
    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }
  } else
    return false;

  const Function &Fn = MF.getFunction();
  DiagnosticInfoUnsupported BadInit(
    Fn, "unsupported initializer for address space", MI.getDebugLoc());
  Fn.getContext().diagnose(BadInit);
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
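  // Replace the pointer operand with a 64-bit constant address space pointer
  // via an addrspacecast, and update the load in place.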
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

// Return the G_BRCOND that consumes the condition output, or null if the
// condition is not used by exactly one G_BRCOND in the same block.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
    UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

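// Return the virtual register bound to the physical live-in \p Reg, creating
// a new one of type \p Ty and registering the live-in if none exists yet.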
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

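  // A masked argument is packed together with other values in one register;
  // extract its field with a shift and mask before copying to the result.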
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

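  // Scale the denominator to keep the reciprocal in range: if |RHS| is
  // larger than 2^96 (bit pattern 0x6f800000), pre-multiply it by 2^-32
  // (0x2f800000) before taking rcp and multiply the quotient by the same
  // scale afterwards; otherwise the scale is 1.0.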
  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

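  // In a kernel the implicit arguments live directly after the explicit
  // kernel arguments, so materialize the kernarg segment pointer plus the
  // implicit parameter offset instead of a separate preloaded register.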
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
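  // A flat pointer points into the queried segment iff its high 32 bits
  // equal that segment's aperture base.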
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // For the control flow intrinsics, replace the intrinsic and its G_BRCOND
  // use with the exec-manipulating branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

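      // SI_IF produces the saved exec mask and performs the branch itself,
      // so it absorbs the G_BRCOND and takes over its target block. The mask
      // output and the condition input are constrained to the wave-wide mask
      // register class.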
      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  default:
    return true;
  }

  return true;
}