1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
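// Odd-element-count vectors of sub-32-bit elements whose total size is not a
// multiple of 32 bits, e.g. v3s16 (48 bits) or v5s8 (40 bits).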
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
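// Pad the vector with one more element, e.g. v3s16 -> v4s16. Commonly paired
// with isSmallOddVector to give such vectors an even element count.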
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
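// Reduce the element count so that each resulting piece is at most 64 bits,
// e.g. a v4s32 operation is processed as v2s32 pieces.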
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
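// For example, v3s16 (48 bits) becomes v4s16 (64 bits), and v5s8 (40 bits)
// becomes v8s8 (64 bits).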
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
// Any combination of 32 or 64-bit elements up to 1024 bits, multiples of
// v2s16, and vectors of 128 or 256-bit elements.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal);
248 
249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250   // elements for v3s16
251   getActionDefinitionsBuilder(G_PHI)
252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253     .legalFor(AllS32Vectors)
254     .legalFor(AllS64Vectors)
255     .legalFor(AddrSpaces64)
256     .legalFor(AddrSpaces32)
257     .clampScalar(0, S32, S256)
258     .widenScalarToNextPow2(0, 32)
259     .clampMaxNumElements(0, S32, 16)
260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261     .legalIf(isPointer(0));
262 
263   if (ST.has16BitInsts()) {
264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265       .legalFor({S32, S16})
266       .clampScalar(0, S16, S32)
267       .scalarize(0);
268   } else {
269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270       .legalFor({S32})
271       .clampScalar(0, S32, S32)
272       .scalarize(0);
273   }
274 
275   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276     .legalFor({S32})
277     .clampScalar(0, S32, S32)
278     .scalarize(0);
279 
280   // Report legal for any types we can handle anywhere. For the cases only legal
281   // on the SALU, RegBankSelect will be able to re-legalize.
282   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284     .clampScalar(0, S32, S64)
285     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287     .widenScalarToNextPow2(0)
288     .scalarize(0);
289 
290   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292     .legalFor({{S32, S1}})
293     .clampScalar(0, S32, S32)
294     .scalarize(0); // TODO: Implement.
295 
296   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297     .lower();
298 
299   getActionDefinitionsBuilder(G_BITCAST)
300     // Don't worry about the size constraint.
301     .legalIf(all(isRegisterType(0), isRegisterType(1)))
302     // FIXME: Testing hack
303     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
304 
305   getActionDefinitionsBuilder(G_FCONSTANT)
306     .legalFor({S32, S64, S16})
307     .clampScalar(0, S16, S64);
308 
309   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .clampScalarOrElt(0, S32, S1024)
314     .legalIf(isMultiple32(0))
315     .widenScalarToNextPow2(0, 32)
316     .clampMaxNumElements(0, S32, 16);
317 
318 
319   // FIXME: i1 operands to intrinsics should always be legal, but other i1
320   // values may not be legal.  We need to figure out how to distinguish
321   // between these two scenarios.
322   getActionDefinitionsBuilder(G_CONSTANT)
323     .legalFor({S1, S32, S64, S16, GlobalPtr,
324                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325     .clampScalar(0, S32, S64)
326     .widenScalarToNextPow2(0)
327     .legalIf(isPointer(0));
328 
329   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332 
333 
334   auto &FPOpActions = getActionDefinitionsBuilder(
335     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336     .legalFor({S32, S64});
337   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338     .customFor({S32, S64});
339   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
340     .customFor({S32, S64});
341 
342   if (ST.has16BitInsts()) {
343     if (ST.hasVOP3PInsts())
344       FPOpActions.legalFor({S16, V2S16});
345     else
346       FPOpActions.legalFor({S16});
347 
348     TrigActions.customFor({S16});
349     FDIVActions.customFor({S16});
350   }
351 
352   auto &MinNumMaxNum = getActionDefinitionsBuilder({
353       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
354 
355   if (ST.hasVOP3PInsts()) {
356     MinNumMaxNum.customFor(FPTypesPK16)
357       .clampMaxNumElements(0, S16, 2)
358       .clampScalar(0, S16, S64)
359       .scalarize(0);
360   } else if (ST.has16BitInsts()) {
361     MinNumMaxNum.customFor(FPTypes16)
362       .clampScalar(0, S16, S64)
363       .scalarize(0);
364   } else {
365     MinNumMaxNum.customFor(FPTypesBase)
366       .clampScalar(0, S32, S64)
367       .scalarize(0);
368   }
369 
370   if (ST.hasVOP3PInsts())
371     FPOpActions.clampMaxNumElements(0, S16, 2);
372 
373   FPOpActions
374     .scalarize(0)
375     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
376 
377   TrigActions
378     .scalarize(0)
379     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
380 
381   FDIVActions
382     .scalarize(0)
383     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
384 
385   getActionDefinitionsBuilder({G_FNEG, G_FABS})
386     .legalFor(FPTypesPK16)
387     .clampMaxNumElements(0, S16, 2)
388     .scalarize(0)
389     .clampScalar(0, S16, S64);
390 
391   // TODO: Implement
392   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
393 
394   if (ST.has16BitInsts()) {
395     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
396       .legalFor({S32, S64, S16})
397       .scalarize(0)
398       .clampScalar(0, S16, S64);
399   } else {
400     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
401       .legalFor({S32, S64})
402       .scalarize(0)
403       .clampScalar(0, S32, S64);
404   }
405 
406   getActionDefinitionsBuilder(G_FPTRUNC)
407     .legalFor({{S32, S64}, {S16, S32}})
408     .scalarize(0);
409 
410   getActionDefinitionsBuilder(G_FPEXT)
411     .legalFor({{S64, S32}, {S32, S16}})
412     .lowerFor({{S64, S16}}) // FIXME: Implement
413     .scalarize(0);
414 
415   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
416   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
417 
418   getActionDefinitionsBuilder(G_FSUB)
419       // Use actual fsub instruction
420       .legalFor({S32})
421       // Must use fadd + fneg
422       .lowerFor({S64, S16, V2S16})
423       .scalarize(0)
424       .clampScalar(0, S32, S64);
425 
426   // Whether this is legal depends on the floating point mode for the function.
427   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
428   if (ST.hasMadF16())
429     FMad.customFor({S32, S16});
430   else
431     FMad.customFor({S32});
432   FMad.scalarize(0)
433       .lower();
434 
435   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
436     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
437                {S32, S1}, {S64, S1}, {S16, S1},
438                {S96, S32},
439                // FIXME: Hack
440                {S64, LLT::scalar(33)},
441                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
442     .scalarize(0);
443 
444   // TODO: Split s1->s64 during regbankselect for VALU.
445   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
446     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
447     .lowerFor({{S32, S64}})
448     .customFor({{S64, S64}});
449   if (ST.has16BitInsts())
450     IToFP.legalFor({{S16, S16}});
451   IToFP.clampScalar(1, S32, S64)
452        .scalarize(0);
453 
454   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
455     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
456   if (ST.has16BitInsts())
457     FPToI.legalFor({{S16, S16}});
458   else
459     FPToI.minScalar(1, S32);
460 
461   FPToI.minScalar(0, S32)
462        .scalarize(0);
463 
464   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
465     .legalFor({S32, S64})
466     .scalarize(0);
467 
468   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
469     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
470       .legalFor({S32, S64})
471       .clampScalar(0, S32, S64)
472       .scalarize(0);
473   } else {
474     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
475       .legalFor({S32})
476       .customFor({S64})
477       .clampScalar(0, S32, S64)
478       .scalarize(0);
479   }
480 
481   getActionDefinitionsBuilder(G_GEP)
482     .legalForCartesianProduct(AddrSpaces64, {S64})
483     .legalForCartesianProduct(AddrSpaces32, {S32})
484     .scalarize(0);
485 
486   getActionDefinitionsBuilder(G_PTR_MASK)
487     .scalarize(0)
488     .alwaysLegal();
489 
490   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
491 
492   auto &CmpBuilder =
493     getActionDefinitionsBuilder(G_ICMP)
494     .legalForCartesianProduct(
495       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
496     .legalFor({{S1, S32}, {S1, S64}});
497   if (ST.has16BitInsts()) {
498     CmpBuilder.legalFor({{S1, S16}});
499   }
500 
501   CmpBuilder
502     .widenScalarToNextPow2(1)
503     .clampScalar(1, S32, S64)
504     .scalarize(0)
505     .legalIf(all(typeIs(0, S1), isPointer(1)));
506 
507   getActionDefinitionsBuilder(G_FCMP)
508     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
509     .widenScalarToNextPow2(1)
510     .clampScalar(1, S32, S64)
511     .scalarize(0);
512 
513   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
514   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
515                                G_FLOG, G_FLOG2, G_FLOG10})
516     .legalFor({S32})
517     .scalarize(0);
518 
519   // The 64-bit versions produce 32-bit results, but only on the SALU.
520   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
521                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
522                                G_CTPOP})
523     .legalFor({{S32, S32}, {S32, S64}})
524     .clampScalar(0, S32, S32)
525     .clampScalar(1, S32, S64)
526     .scalarize(0)
527     .widenScalarToNextPow2(0, 32)
528     .widenScalarToNextPow2(1, 32);
529 
530   // TODO: Expand for > s32
531   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
532     .legalFor({S32})
533     .clampScalar(0, S32, S32)
534     .scalarize(0);
535 
536   if (ST.has16BitInsts()) {
537     if (ST.hasVOP3PInsts()) {
538       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
539         .legalFor({S32, S16, V2S16})
540         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
541         .clampMaxNumElements(0, S16, 2)
542         .clampScalar(0, S16, S32)
543         .widenScalarToNextPow2(0)
544         .scalarize(0);
545     } else {
546       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
547         .legalFor({S32, S16})
548         .widenScalarToNextPow2(0)
549         .clampScalar(0, S16, S32)
550         .scalarize(0);
551     }
552   } else {
553     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
554       .legalFor({S32})
555       .clampScalar(0, S32, S32)
556       .widenScalarToNextPow2(0)
557       .scalarize(0);
558   }
559 
560   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
561     return [=](const LegalityQuery &Query) {
562       return Query.Types[TypeIdx0].getSizeInBits() <
563              Query.Types[TypeIdx1].getSizeInBits();
564     };
565   };
566 
567   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
568     return [=](const LegalityQuery &Query) {
569       return Query.Types[TypeIdx0].getSizeInBits() >
570              Query.Types[TypeIdx1].getSizeInBits();
571     };
572   };
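  // These predicates are used below to widen or narrow the integer operand of
  // G_INTTOPTR / G_PTRTOINT until its size matches the pointer size.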
573 
574   getActionDefinitionsBuilder(G_INTTOPTR)
575     // List the common cases
576     .legalForCartesianProduct(AddrSpaces64, {S64})
577     .legalForCartesianProduct(AddrSpaces32, {S32})
578     .scalarize(0)
579     // Accept any address space as long as the size matches
580     .legalIf(sameSize(0, 1))
581     .widenScalarIf(smallerThan(1, 0),
582       [](const LegalityQuery &Query) {
583         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
584       })
585     .narrowScalarIf(greaterThan(1, 0),
586       [](const LegalityQuery &Query) {
587         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
588       });
589 
590   getActionDefinitionsBuilder(G_PTRTOINT)
591     // List the common cases
592     .legalForCartesianProduct(AddrSpaces64, {S64})
593     .legalForCartesianProduct(AddrSpaces32, {S32})
594     .scalarize(0)
595     // Accept any address space as long as the size matches
596     .legalIf(sameSize(0, 1))
597     .widenScalarIf(smallerThan(0, 1),
598       [](const LegalityQuery &Query) {
599         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
600       })
601     .narrowScalarIf(
602       greaterThan(0, 1),
603       [](const LegalityQuery &Query) {
604         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
605       });
606 
607   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
608     .scalarize(0)
609     .custom();
610 
611   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
612   // handle some operations by just promoting the register during
613   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
614   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
615     switch (AS) {
616     // FIXME: Private element size.
617     case AMDGPUAS::PRIVATE_ADDRESS:
618       return 32;
619     // FIXME: Check subtarget
620     case AMDGPUAS::LOCAL_ADDRESS:
621       return ST.useDS128() ? 128 : 64;
622 
623     // Treat constant and global as identical. SMRD loads are sometimes usable
624     // for global loads (ideally constant address space should be eliminated)
625     // depending on the context. Legality cannot be context dependent, but
626     // RegBankSelect can split the load as necessary depending on the pointer
627     // register bank/uniformity and if the memory is invariant or not written in
628     // a kernel.
629     case AMDGPUAS::CONSTANT_ADDRESS:
630     case AMDGPUAS::GLOBAL_ADDRESS:
631       return 512;
632     default:
633       return 128;
634     }
635   };
636 
637   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
638     const LLT DstTy = Query.Types[0];
639 
640     // Split vector extloads.
641     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
642     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
643       return true;
644 
645     const LLT PtrTy = Query.Types[1];
646     unsigned AS = PtrTy.getAddressSpace();
647     if (MemSize > maxSizeForAddrSpace(AS))
648       return true;
649 
650     // Catch weird sized loads that don't evenly divide into the access sizes
651     // TODO: May be able to widen depending on alignment etc.
652     unsigned NumRegs = MemSize / 32;
653     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
654       return true;
655 
656     unsigned Align = Query.MMODescrs[0].AlignInBits;
657     if (Align < MemSize) {
658       const SITargetLowering *TLI = ST.getTargetLowering();
659       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
660     }
661 
662     return false;
663   };
664 
665   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
666   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
667   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
668 
669   // TODO: Refine based on subtargets which support unaligned access or 128-bit
670   // LDS
671   // TODO: Unsupported flat for SI.
672 
673   for (unsigned Op : {G_LOAD, G_STORE}) {
674     const bool IsStore = Op == G_STORE;
675 
676     auto &Actions = getActionDefinitionsBuilder(Op);
677     // Whitelist the common cases.
678     // TODO: Pointer loads
679     // TODO: Wide constant loads
680     // TODO: Only CI+ has 3x loads
681     // TODO: Loads to s16 on gfx9
682     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
683                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
684                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
685                                       {S96, GlobalPtr, 96, GlobalAlign32},
686                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
687                                       {S128, GlobalPtr, 128, GlobalAlign32},
688                                       {S64, GlobalPtr, 64, GlobalAlign32},
689                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
690                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
691                                       {S32, GlobalPtr, 8, GlobalAlign8},
692                                       {S32, GlobalPtr, 16, GlobalAlign16},
693 
694                                       {S32, LocalPtr, 32, 32},
695                                       {S64, LocalPtr, 64, 32},
696                                       {V2S32, LocalPtr, 64, 32},
697                                       {S32, LocalPtr, 8, 8},
698                                       {S32, LocalPtr, 16, 16},
699                                       {V2S16, LocalPtr, 32, 32},
700 
701                                       {S32, PrivatePtr, 32, 32},
702                                       {S32, PrivatePtr, 8, 8},
703                                       {S32, PrivatePtr, 16, 16},
704                                       {V2S16, PrivatePtr, 32, 32},
705 
706                                       {S32, FlatPtr, 32, GlobalAlign32},
707                                       {S32, FlatPtr, 16, GlobalAlign16},
708                                       {S32, FlatPtr, 8, GlobalAlign8},
709                                       {V2S16, FlatPtr, 32, GlobalAlign32},
710 
711                                       {S32, ConstantPtr, 32, GlobalAlign32},
712                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
713                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
714                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
715                                       {S64, ConstantPtr, 64, GlobalAlign32},
716                                       {S128, ConstantPtr, 128, GlobalAlign32},
717                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
718     Actions
719         .customIf(typeIs(1, Constant32Ptr))
720         .narrowScalarIf(
721             [=](const LegalityQuery &Query) -> bool {
722               return !Query.Types[0].isVector() && needToSplitLoad(Query);
723             },
724             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
725               const LLT DstTy = Query.Types[0];
726               const LLT PtrTy = Query.Types[1];
727 
728               const unsigned DstSize = DstTy.getSizeInBits();
729               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
730 
731               // Split extloads.
732               if (DstSize > MemSize)
733                 return std::make_pair(0, LLT::scalar(MemSize));
734 
735               if (DstSize > 32 && (DstSize % 32 != 0)) {
736                 // FIXME: Need a way to specify non-extload of larger size if
737                 // suitably aligned.
738                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
739               }
740 
741               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
742               if (MemSize > MaxSize)
743                 return std::make_pair(0, LLT::scalar(MaxSize));
744 
745               unsigned Align = Query.MMODescrs[0].AlignInBits;
746               return std::make_pair(0, LLT::scalar(Align));
747             })
748         .fewerElementsIf(
749             [=](const LegalityQuery &Query) -> bool {
750               return Query.Types[0].isVector() && needToSplitLoad(Query);
751             },
752             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
753               const LLT DstTy = Query.Types[0];
754               const LLT PtrTy = Query.Types[1];
755 
756               LLT EltTy = DstTy.getElementType();
757               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
758 
759               // Split if it's too large for the address space.
760               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
761                 unsigned NumElts = DstTy.getNumElements();
762                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
763 
764                 // FIXME: Refine when odd breakdowns handled
765                 // The scalars will need to be re-legalized.
766                 if (NumPieces == 1 || NumPieces >= NumElts ||
767                     NumElts % NumPieces != 0)
768                   return std::make_pair(0, EltTy);
769 
770                 return std::make_pair(0,
771                                       LLT::vector(NumElts / NumPieces, EltTy));
772               }
773 
774               // Need to split because of alignment.
775               unsigned Align = Query.MMODescrs[0].AlignInBits;
776               unsigned EltSize = EltTy.getSizeInBits();
777               if (EltSize > Align &&
778                   (EltSize / Align < DstTy.getNumElements())) {
779                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
780               }
781 
782               // May need relegalization for the scalars.
783               return std::make_pair(0, EltTy);
784             })
785         .minScalar(0, S32);
786 
787     if (IsStore)
788       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
789 
790     // TODO: Need a bitcast lower option?
791     Actions
792         .legalIf([=](const LegalityQuery &Query) {
793           const LLT Ty0 = Query.Types[0];
794           unsigned Size = Ty0.getSizeInBits();
795           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
796           unsigned Align = Query.MMODescrs[0].AlignInBits;
797 
798           // No extending vector loads.
799           if (Size > MemSize && Ty0.isVector())
800             return false;
801 
802           // FIXME: Widening store from alignment not valid.
803           if (MemSize < Size)
804             MemSize = std::max(MemSize, Align);
805 
806           switch (MemSize) {
807           case 8:
808           case 16:
809             return Size == 32;
810           case 32:
811           case 64:
812           case 128:
813             return true;
814           case 96:
815             return ST.hasDwordx3LoadStores();
816           case 256:
817           case 512:
818             return true;
819           default:
820             return false;
821           }
822         })
823         .widenScalarToNextPow2(0)
824         // TODO: v3s32->v4s32 with alignment
825         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
826   }
827 
828   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
829                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
830                                                   {S32, GlobalPtr, 16, 2 * 8},
831                                                   {S32, LocalPtr, 8, 8},
832                                                   {S32, LocalPtr, 16, 16},
833                                                   {S32, PrivatePtr, 8, 8},
834                                                   {S32, PrivatePtr, 16, 16},
835                                                   {S32, ConstantPtr, 8, 8},
836                                                   {S32, ConstantPtr, 16, 2 * 8}});
837   if (ST.hasFlatAddressSpace()) {
838     ExtLoads.legalForTypesWithMemDesc(
839         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
840   }
841 
842   ExtLoads.clampScalar(0, S32, S32)
843           .widenScalarToNextPow2(0)
844           .unsupportedIfMemSizeNotPow2()
845           .lower();
846 
847   auto &Atomics = getActionDefinitionsBuilder(
848     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
849      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
850      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
851      G_ATOMICRMW_UMIN})
852     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
853                {S64, GlobalPtr}, {S64, LocalPtr}});
854   if (ST.hasFlatAddressSpace()) {
855     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
856   }
857 
858   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
859     .legalFor({{S32, LocalPtr}});
860 
861   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
862   // demarshalling
863   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
864     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
865                 {S32, FlatPtr}, {S64, FlatPtr}})
866     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
867                {S32, RegionPtr}, {S64, RegionPtr}});
868 
869   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
870     .lower();
871 
872   // TODO: Pointer types, any 32-bit or 64-bit vector
873   getActionDefinitionsBuilder(G_SELECT)
874     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
875           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
876           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
877     .clampScalar(0, S16, S64)
878     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
879     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
880     .scalarize(1)
881     .clampMaxNumElements(0, S32, 2)
882     .clampMaxNumElements(0, LocalPtr, 2)
883     .clampMaxNumElements(0, PrivatePtr, 2)
884     .scalarize(0)
885     .widenScalarToNextPow2(0)
886     .legalIf(all(isPointer(0), typeIs(1, S1)));
887 
888   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
889   // be more flexible with the shift amount type.
890   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
891     .legalFor({{S32, S32}, {S64, S32}});
892   if (ST.has16BitInsts()) {
893     if (ST.hasVOP3PInsts()) {
894       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
895             .clampMaxNumElements(0, S16, 2);
896     } else
897       Shifts.legalFor({{S16, S32}, {S16, S16}});
898 
899     Shifts.clampScalar(1, S16, S32);
900     Shifts.clampScalar(0, S16, S64);
901     Shifts.widenScalarToNextPow2(0, 16);
902   } else {
903     // Make sure we legalize the shift amount type first, as the general
904     // expansion for the shifted type will produce much worse code if it hasn't
905     // been truncated already.
906     Shifts.clampScalar(1, S32, S32);
907     Shifts.clampScalar(0, S32, S64);
908     Shifts.widenScalarToNextPow2(0, 32);
909   }
910   Shifts.scalarize(0);
911 
912   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
913     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
914     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
915     unsigned IdxTypeIdx = 2;
916 
917     getActionDefinitionsBuilder(Op)
918       .customIf([=](const LegalityQuery &Query) {
919           const LLT EltTy = Query.Types[EltTypeIdx];
920           const LLT VecTy = Query.Types[VecTypeIdx];
921           const LLT IdxTy = Query.Types[IdxTypeIdx];
922           return (EltTy.getSizeInBits() == 16 ||
923                   EltTy.getSizeInBits() % 32 == 0) &&
924                  VecTy.getSizeInBits() % 32 == 0 &&
925                  VecTy.getSizeInBits() <= 1024 &&
926                  IdxTy.getSizeInBits() == 32;
927         })
928       .clampScalar(EltTypeIdx, S32, S64)
929       .clampScalar(VecTypeIdx, S32, S64)
930       .clampScalar(IdxTypeIdx, S32, S32);
931   }
932 
933   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
934     .unsupportedIf([=](const LegalityQuery &Query) {
935         const LLT &EltTy = Query.Types[1].getElementType();
936         return Query.Types[0] != EltTy;
937       });
938 
939   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
940     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
941     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
942 
943     // FIXME: Doesn't handle extract of illegal sizes.
944     getActionDefinitionsBuilder(Op)
945       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
946       // FIXME: Multiples of 16 should not be legal.
947       .legalIf([=](const LegalityQuery &Query) {
948           const LLT BigTy = Query.Types[BigTyIdx];
949           const LLT LitTy = Query.Types[LitTyIdx];
950           return (BigTy.getSizeInBits() % 32 == 0) &&
951                  (LitTy.getSizeInBits() % 16 == 0);
952         })
953       .widenScalarIf(
954         [=](const LegalityQuery &Query) {
955           const LLT BigTy = Query.Types[BigTyIdx];
956           return (BigTy.getScalarSizeInBits() < 16);
957         },
958         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
959       .widenScalarIf(
960         [=](const LegalityQuery &Query) {
961           const LLT LitTy = Query.Types[LitTyIdx];
962           return (LitTy.getScalarSizeInBits() < 16);
963         },
964         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
965       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
966       .widenScalarToNextPow2(BigTyIdx, 32);
967 
968   }
969 
970   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
971     .legalForCartesianProduct(AllS32Vectors, {S32})
972     .legalForCartesianProduct(AllS64Vectors, {S64})
973     .clampNumElements(0, V16S32, V32S32)
974     .clampNumElements(0, V2S64, V16S64)
975     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
976 
977   if (ST.hasScalarPackInsts())
978     BuildVector.legalFor({V2S16, S32});
979 
980   BuildVector
981     .minScalarSameAs(1, 0)
982     .legalIf(isRegisterType(0))
983     .minScalarOrElt(0, S32);
984 
985   if (ST.hasScalarPackInsts()) {
986     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
987       .legalFor({V2S16, S32})
988       .lower();
989   } else {
990     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
991       .lower();
992   }
993 
994   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
995     .legalIf(isRegisterType(0));
996 
997   // TODO: Don't fully scalarize v2s16 pieces
998   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
999 
1000   // Merge/Unmerge
1001   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1002     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1003     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1004 
1005     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1006       const LLT &Ty = Query.Types[TypeIdx];
1007       if (Ty.isVector()) {
1008         const LLT &EltTy = Ty.getElementType();
1009         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1010           return true;
1011         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1012           return true;
1013       }
1014       return false;
1015     };
1016 
1017     auto &Builder = getActionDefinitionsBuilder(Op)
1018       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1022       .clampScalar(LitTyIdx, S16, S256)
1023       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1024       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1025       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1026                            elementTypeIs(1, S16)),
1027                        changeTo(1, V2S16))
1028       // Break up vectors with weird elements into scalars
1029       .fewerElementsIf(
1030         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1031         scalarize(0))
1032       .fewerElementsIf(
1033         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1034         scalarize(1))
1035       .clampScalar(BigTyIdx, S32, S1024)
1036       .lowerFor({{S16, V2S16}});
1037 
1038     if (Op == G_MERGE_VALUES) {
1039       Builder.widenScalarIf(
1040         // TODO: Use 16-bit shifts if legal for 8-bit values?
1041         [=](const LegalityQuery &Query) {
1042           const LLT Ty = Query.Types[LitTyIdx];
1043           return Ty.getSizeInBits() < 32;
1044         },
1045         changeTo(LitTyIdx, S32));
1046     }
1047 
1048     Builder.widenScalarIf(
1049       [=](const LegalityQuery &Query) {
1050         const LLT Ty = Query.Types[BigTyIdx];
1051         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1052           Ty.getSizeInBits() % 16 != 0;
1053       },
1054       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
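        // For example a 40-bit big type is widened to s64, while a 300-bit
        // big type is widened to s320 rather than s512.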
1057         const LLT &Ty = Query.Types[BigTyIdx];
1058         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1059         if (NewSizeInBits >= 256) {
1060           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1061           if (RoundedTo < NewSizeInBits)
1062             NewSizeInBits = RoundedTo;
1063         }
1064         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1065       })
1066       .legalIf([=](const LegalityQuery &Query) {
1067           const LLT &BigTy = Query.Types[BigTyIdx];
1068           const LLT &LitTy = Query.Types[LitTyIdx];
1069 
1070           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1071             return false;
1072           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1073             return false;
1074 
1075           return BigTy.getSizeInBits() % 16 == 0 &&
1076                  LitTy.getSizeInBits() % 16 == 0 &&
1077                  BigTy.getSizeInBits() <= 1024;
1078         })
1079       // Any vectors left are the wrong size. Scalarize them.
1080       .scalarize(0)
1081       .scalarize(1);
1082   }
1083 
1084   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1085 
1086   computeTables();
1087   verify(*ST.getInstrInfo());
1088 }
1089 
1090 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1091                                          MachineRegisterInfo &MRI,
1092                                          MachineIRBuilder &B,
1093                                          GISelChangeObserver &Observer) const {
1094   switch (MI.getOpcode()) {
1095   case TargetOpcode::G_ADDRSPACE_CAST:
1096     return legalizeAddrSpaceCast(MI, MRI, B);
1097   case TargetOpcode::G_FRINT:
1098     return legalizeFrint(MI, MRI, B);
1099   case TargetOpcode::G_FCEIL:
1100     return legalizeFceil(MI, MRI, B);
1101   case TargetOpcode::G_INTRINSIC_TRUNC:
1102     return legalizeIntrinsicTrunc(MI, MRI, B);
1103   case TargetOpcode::G_SITOFP:
1104     return legalizeITOFP(MI, MRI, B, true);
1105   case TargetOpcode::G_UITOFP:
1106     return legalizeITOFP(MI, MRI, B, false);
1107   case TargetOpcode::G_FMINNUM:
1108   case TargetOpcode::G_FMAXNUM:
1109   case TargetOpcode::G_FMINNUM_IEEE:
1110   case TargetOpcode::G_FMAXNUM_IEEE:
1111     return legalizeMinNumMaxNum(MI, MRI, B);
1112   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1113     return legalizeExtractVectorElt(MI, MRI, B);
1114   case TargetOpcode::G_INSERT_VECTOR_ELT:
1115     return legalizeInsertVectorElt(MI, MRI, B);
1116   case TargetOpcode::G_FSIN:
1117   case TargetOpcode::G_FCOS:
1118     return legalizeSinCos(MI, MRI, B);
1119   case TargetOpcode::G_GLOBAL_VALUE:
1120     return legalizeGlobalValue(MI, MRI, B);
1121   case TargetOpcode::G_LOAD:
1122     return legalizeLoad(MI, MRI, B, Observer);
1123   case TargetOpcode::G_FMAD:
1124     return legalizeFMad(MI, MRI, B);
1125   case TargetOpcode::G_FDIV:
1126     return legalizeFDIV(MI, MRI, B);
1127   case TargetOpcode::G_ATOMIC_CMPXCHG:
1128     return legalizeAtomicCmpXChg(MI, MRI, B);
1129   default:
1130     return false;
1131   }
1132 
1133   llvm_unreachable("expected switch to return");
1134 }
1135 
1136 Register AMDGPULegalizerInfo::getSegmentAperture(
1137   unsigned AS,
1138   MachineRegisterInfo &MRI,
1139   MachineIRBuilder &B) const {
1140   MachineFunction &MF = B.getMF();
1141   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1142   const LLT S32 = LLT::scalar(32);
1143 
1144   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1145 
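  // On subtargets with aperture registers the base is read directly from a
  // hardware register; otherwise it is loaded from the queue pointer below.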
1146   if (ST.hasApertureRegs()) {
1147     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1148     // getreg.
1149     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1150         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1151         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1152     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1153         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1154         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1155     unsigned Encoding =
1156         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1157         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1158         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1159 
1160     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1161     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1162 
1163     B.buildInstr(AMDGPU::S_GETREG_B32)
1164       .addDef(GetReg)
1165       .addImm(Encoding);
1166     MRI.setType(GetReg, S32);
1167 
1168     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1169     B.buildInstr(TargetOpcode::G_SHL)
1170       .addDef(ApertureReg)
1171       .addUse(GetReg)
1172       .addUse(ShiftAmt.getReg(0));
1173 
1174     return ApertureReg;
1175   }
1176 
1177   Register QueuePtr = MRI.createGenericVirtualRegister(
1178     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1179 
1180   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1181   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1182     return Register();
1183 
1184   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1185   // private_segment_aperture_base_hi.
1186   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1187 
1188   // FIXME: Don't use undef
1189   Value *V = UndefValue::get(PointerType::get(
1190                                Type::getInt8Ty(MF.getFunction().getContext()),
1191                                AMDGPUAS::CONSTANT_ADDRESS));
1192 
1193   MachinePointerInfo PtrInfo(V, StructOffset);
1194   MachineMemOperand *MMO = MF.getMachineMemOperand(
1195     PtrInfo,
1196     MachineMemOperand::MOLoad |
1197     MachineMemOperand::MODereferenceable |
1198     MachineMemOperand::MOInvariant,
1199     4,
1200     MinAlign(64, StructOffset));
1201 
1202   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1203   Register LoadAddr;
1204 
1205   B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1206   B.buildLoad(LoadResult, LoadAddr, *MMO);
1207   return LoadResult;
1208 }
1209 
1210 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1211   MachineInstr &MI, MachineRegisterInfo &MRI,
1212   MachineIRBuilder &B) const {
1213   MachineFunction &MF = B.getMF();
1214 
1215   B.setInstr(MI);
1216 
1217   const LLT S32 = LLT::scalar(32);
1218   Register Dst = MI.getOperand(0).getReg();
1219   Register Src = MI.getOperand(1).getReg();
1220 
1221   LLT DstTy = MRI.getType(Dst);
1222   LLT SrcTy = MRI.getType(Src);
1223   unsigned DestAS = DstTy.getAddressSpace();
1224   unsigned SrcAS = SrcTy.getAddressSpace();
1225 
1226   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1227   // vector element.
1228   assert(!DstTy.isVector());
1229 
1230   const AMDGPUTargetMachine &TM
1231     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1232 
1233   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1234   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1235     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1236     return true;
1237   }
1238 
1239   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1240     // Truncate.
1241     B.buildExtract(Dst, Src, 0);
1242     MI.eraseFromParent();
1243     return true;
1244   }
1245 
1246   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1247     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1248     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1249 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // produce the full pointer. Merge operands are required to be the same
    // type, but creating an extra ptrtoint would be kind of pointless.
1253     auto HighAddr = B.buildConstant(
1254       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1255     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1256     MI.eraseFromParent();
1257     return true;
1258   }
1259 
1260   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1261     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1262            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1263     unsigned NullVal = TM.getNullPointerValue(DestAS);
1264 
1265     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1266     auto FlatNull = B.buildConstant(SrcTy, 0);
1267 
1268     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1269 
1270     // Extract low 32-bits of the pointer.
1271     B.buildExtract(PtrLo32, Src, 0);
1272 
1273     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1274     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1275     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1276 
1277     MI.eraseFromParent();
1278     return true;
1279   }
1280 
1281   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1282     return false;
1283 
1284   if (!ST.hasFlatAddressSpace())
1285     return false;
1286 
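  // Cast a 32-bit local/private pointer to a 64-bit flat pointer: build the
  // flat address from the 32-bit offset and the segment's aperture base, and
  // map the segment null value to the flat null value.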
1287   auto SegmentNull =
1288       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1289   auto FlatNull =
1290       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1291 
1292   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1293   if (!ApertureReg.isValid())
1294     return false;
1295 
1296   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1297   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1298 
1299   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1300 
1301   // Coerce the type of the low half of the result so we can use merge_values.
1302   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1303   B.buildInstr(TargetOpcode::G_PTRTOINT)
1304     .addDef(SrcAsInt)
1305     .addUse(Src);
1306 
1307   // TODO: Should we allow mismatched types but matching sizes in merges to
1308   // avoid the ptrtoint?
1309   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1310   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1311 
1312   MI.eraseFromParent();
1313   return true;
1314 }
1315 
1316 bool AMDGPULegalizerInfo::legalizeFrint(
1317   MachineInstr &MI, MachineRegisterInfo &MRI,
1318   MachineIRBuilder &B) const {
1319   B.setInstr(MI);
1320 
1321   Register Src = MI.getOperand(1).getReg();
1322   LLT Ty = MRI.getType(Src);
1323   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1324 
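  // Round by adding and then subtracting 2^52 with the sign of the source;
  // magnitudes above 2^52 - 0.5 are already integral and are returned
  // unchanged.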
1325   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1326   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1327 
1328   auto C1 = B.buildFConstant(Ty, C1Val);
1329   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1330 
1331   // TODO: Should this propagate fast-math-flags?
1332   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1333   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1334 
1335   auto C2 = B.buildFConstant(Ty, C2Val);
1336   auto Fabs = B.buildFAbs(Ty, Src);
1337 
1338   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1339   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1340   return true;
1341 }
1342 
1343 bool AMDGPULegalizerInfo::legalizeFceil(
1344   MachineInstr &MI, MachineRegisterInfo &MRI,
1345   MachineIRBuilder &B) const {
1346   B.setInstr(MI);
1347 
1348   const LLT S1 = LLT::scalar(1);
1349   const LLT S64 = LLT::scalar(64);
1350 
1351   Register Src = MI.getOperand(1).getReg();
1352   assert(MRI.getType(Src) == S64);
1353 
1354   // result = trunc(src)
1355   // if (src > 0.0 && src != result)
1356   //   result += 1.0
1357 
1358   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1359 
1360   const auto Zero = B.buildFConstant(S64, 0.0);
1361   const auto One = B.buildFConstant(S64, 1.0);
1362   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1363   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1364   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1365   auto Add = B.buildSelect(S64, And, One, Zero);
1366 
1367   // TODO: Should this propagate fast-math-flags?
1368   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1369   return true;
1370 }
1371 
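// Compute the unbiased exponent of an f64 given the register holding its high
// 32 bits: extract the 11 exponent bits starting at bit 20 with ubfe and
// subtract the bias of 1023.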
1372 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1373                                               MachineIRBuilder &B) {
1374   const unsigned FractBits = 52;
1375   const unsigned ExpBits = 11;
1376   LLT S32 = LLT::scalar(32);
1377 
1378   auto Const0 = B.buildConstant(S32, FractBits - 32);
1379   auto Const1 = B.buildConstant(S32, ExpBits);
1380 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1384 
1385   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1386 }
1387 
1388 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1389   MachineInstr &MI, MachineRegisterInfo &MRI,
1390   MachineIRBuilder &B) const {
1391   B.setInstr(MI);
1392 
1393   const LLT S1 = LLT::scalar(1);
1394   const LLT S32 = LLT::scalar(32);
1395   const LLT S64 = LLT::scalar(64);
1396 
1397   Register Src = MI.getOperand(1).getReg();
1398   assert(MRI.getType(Src) == S64);
1399 
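  // Truncate by clearing the fraction bits selected by the exponent. A
  // negative exponent produces a signed zero, and an exponent above 51 means
  // the value is already an integer and is returned unchanged.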
1400   // TODO: Should this use extract since the low half is unused?
1401   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1402   Register Hi = Unmerge.getReg(1);
1403 
1404   // Extract the upper half, since this is where we will find the sign and
1405   // exponent.
1406   auto Exp = extractF64Exponent(Hi, B);
1407 
1408   const unsigned FractBits = 52;
1409 
1410   // Extract the sign bit.
1411   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1412   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1413 
1414   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1415 
1416   const auto Zero32 = B.buildConstant(S32, 0);
1417 
1418   // Extend back to 64-bits.
1419   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1420 
1421   auto Shr = B.buildAShr(S64, FractMask, Exp);
1422   auto Not = B.buildNot(S64, Shr);
1423   auto Tmp0 = B.buildAnd(S64, Src, Not);
1424   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1425 
1426   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1427   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1428 
1429   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1430   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1431   return true;
1432 }
1433 
1434 bool AMDGPULegalizerInfo::legalizeITOFP(
1435   MachineInstr &MI, MachineRegisterInfo &MRI,
1436   MachineIRBuilder &B, bool Signed) const {
1437   B.setInstr(MI);
1438 
1439   Register Dst = MI.getOperand(0).getReg();
1440   Register Src = MI.getOperand(1).getReg();
1441 
1442   const LLT S64 = LLT::scalar(64);
1443   const LLT S32 = LLT::scalar(32);
1444 
1445   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1446 
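  // Convert the two 32-bit halves separately: convert the high half (signed or
  // unsigned), scale it by 2^32 with ldexp, then add in the unsigned low half.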
1447   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1448 
1449   auto CvtHi = Signed ?
1450     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1451     B.buildUITOFP(S64, Unmerge.getReg(1));
1452 
1453   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1454 
1455   auto ThirtyTwo = B.buildConstant(S32, 32);
1456   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1457     .addUse(CvtHi.getReg(0))
1458     .addUse(ThirtyTwo.getReg(0));
1459 
1460   // TODO: Should this propagate fast-math-flags?
1461   B.buildFAdd(Dst, LdExp, CvtLo);
1462   MI.eraseFromParent();
1463   return true;
1464 }
1465 
1466 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1467   MachineInstr &MI, MachineRegisterInfo &MRI,
1468   MachineIRBuilder &B) const {
1469   MachineFunction &MF = B.getMF();
1470   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1471 
1472   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1473                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1474 
1475   // With ieee_mode disabled, the instructions already have the correct
1476   // behavior for G_FMINNUM/G_FMAXNUM.
1477   if (!MFI->getMode().IEEE)
1478     return !IsIEEEOp;
1479 
1480   if (IsIEEEOp)
1481     return true;
1482 
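  // With ieee_mode enabled, only the *_IEEE variants have the required
  // behavior, so lower G_FMINNUM/G_FMAXNUM through the generic helper (which
  // also quiets signaling NaN inputs where needed).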
1483   MachineIRBuilder HelperBuilder(MI);
1484   GISelObserverWrapper DummyObserver;
1485   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1486   HelperBuilder.setInstr(MI);
1487   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1488 }
1489 
1490 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1491   MachineInstr &MI, MachineRegisterInfo &MRI,
1492   MachineIRBuilder &B) const {
1493   // TODO: Should move some of this into LegalizerHelper.
1494 
1495   // TODO: Promote dynamic indexing of s16 to s32
1496   // TODO: Dynamic s64 indexing is only legal for SGPR.
1497   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1498   if (!IdxVal) // Dynamic case will be selected to register indexing.
1499     return true;
1500 
1501   Register Dst = MI.getOperand(0).getReg();
1502   Register Vec = MI.getOperand(1).getReg();
1503 
1504   LLT VecTy = MRI.getType(Vec);
1505   LLT EltTy = VecTy.getElementType();
1506   assert(EltTy == MRI.getType(Dst));
1507 
1508   B.setInstr(MI);
1509 
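  // With a constant index, either fold the extract to a fixed bit offset or,
  // if the index is out of bounds, fold it to undef.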
1510   if (IdxVal.getValue() < VecTy.getNumElements())
1511     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1512   else
1513     B.buildUndef(Dst);
1514 
1515   MI.eraseFromParent();
1516   return true;
1517 }
1518 
1519 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1520   MachineInstr &MI, MachineRegisterInfo &MRI,
1521   MachineIRBuilder &B) const {
1522   // TODO: Should move some of this into LegalizerHelper.
1523 
1524   // TODO: Promote dynamic indexing of s16 to s32
1525   // TODO: Dynamic s64 indexing is only legal for SGPR.
1526   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1527   if (!IdxVal) // Dynamic case will be selected to register indexing.
1528     return true;
1529 
1530   Register Dst = MI.getOperand(0).getReg();
1531   Register Vec = MI.getOperand(1).getReg();
1532   Register Ins = MI.getOperand(2).getReg();
1533 
1534   LLT VecTy = MRI.getType(Vec);
1535   LLT EltTy = VecTy.getElementType();
1536   assert(EltTy == MRI.getType(Ins));
1537 
1538   B.setInstr(MI);
1539 
1540   if (IdxVal.getValue() < VecTy.getNumElements())
1541     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1542   else
1543     B.buildUndef(Dst);
1544 
1545   MI.eraseFromParent();
1546   return true;
1547 }
1548 
1549 bool AMDGPULegalizerInfo::legalizeSinCos(
1550   MachineInstr &MI, MachineRegisterInfo &MRI,
1551   MachineIRBuilder &B) const {
1552   B.setInstr(MI);
1553 
1554   Register DstReg = MI.getOperand(0).getReg();
1555   Register SrcReg = MI.getOperand(1).getReg();
1556   LLT Ty = MRI.getType(DstReg);
1557   unsigned Flags = MI.getFlags();
1558 
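  // The amdgcn sin/cos intrinsics expect the input pre-scaled by 1/(2*pi). On
  // subtargets with a reduced valid input range, also take the fractional part
  // to keep the scaled value in range.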
1559   Register TrigVal;
1560   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1561   if (ST.hasTrigReducedRange()) {
1562     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1563     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1564       .addUse(MulVal.getReg(0))
1565       .setMIFlags(Flags).getReg(0);
1566   } else
1567     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1568 
1569   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1570     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1571   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1572     .addUse(TrigVal)
1573     .setMIFlags(Flags);
1574   MI.eraseFromParent();
1575   return true;
1576 }
1577 
1578 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1579   Register DstReg, LLT PtrTy,
1580   MachineIRBuilder &B, const GlobalValue *GV,
1581   unsigned Offset, unsigned GAFlags) const {
1582   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1583   // to the following code sequence:
1584   //
1585   // For constant address space:
1586   //   s_getpc_b64 s[0:1]
1587   //   s_add_u32 s0, s0, $symbol
1588   //   s_addc_u32 s1, s1, 0
1589   //
1590   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1591   //   a fixup or relocation is emitted to replace $symbol with a literal
1592   //   constant, which is a pc-relative offset from the encoding of the $symbol
1593   //   operand to the global variable.
1594   //
1595   // For global address space:
1596   //   s_getpc_b64 s[0:1]
1597   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1598   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1599   //
1600   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1601   //   fixups or relocations are emitted to replace $symbol@*@lo and
1602   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1603   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1604   //   operand to the global variable.
1605   //
1606   // What we want here is an offset from the value returned by s_getpc
1607   // (which is the address of the s_add_u32 instruction) to the global
1608   // variable, but since the encoding of $symbol starts 4 bytes after the start
1609   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1610   // small. This requires us to add 4 to the global variable offset in order to
1611   // compute the correct address.
1612 
1613   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1614 
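  // The pc-relative address is always computed as a 64-bit constant pointer;
  // for 32-bit pointer types, compute into a temporary register and extract
  // the low half below.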
1615   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1616     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1617 
1618   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1619     .addDef(PCReg);
1620 
1621   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1622   if (GAFlags == SIInstrInfo::MO_NONE)
1623     MIB.addImm(0);
1624   else
1625     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1626 
1627   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1628 
1629   if (PtrTy.getSizeInBits() == 32)
1630     B.buildExtract(DstReg, PCReg, 0);
1631   return true;
1632 }
1633 
1634 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1635   MachineInstr &MI, MachineRegisterInfo &MRI,
1636   MachineIRBuilder &B) const {
1637   Register DstReg = MI.getOperand(0).getReg();
1638   LLT Ty = MRI.getType(DstReg);
1639   unsigned AS = Ty.getAddressSpace();
1640 
1641   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1642   MachineFunction &MF = B.getMF();
1643   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1644   B.setInstr(MI);
1645 
1646   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1647     if (!MFI->isEntryFunction()) {
1648       const Function &Fn = MF.getFunction();
1649       DiagnosticInfoUnsupported BadLDSDecl(
1650         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1651       Fn.getContext().diagnose(BadLDSDecl);
1652     }
1653 
1654     // TODO: We could emit code to handle the initialization somewhere.
1655     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1656       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1657       MI.eraseFromParent();
1658       return true;
1659     }
1660 
1661     const Function &Fn = MF.getFunction();
1662     DiagnosticInfoUnsupported BadInit(
1663       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1664     Fn.getContext().diagnose(BadInit);
1665     return true;
1666   }
1667 
1668   const SITargetLowering *TLI = ST.getTargetLowering();
1669 
1670   if (TLI->shouldEmitFixup(GV)) {
1671     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1672     MI.eraseFromParent();
1673     return true;
1674   }
1675 
1676   if (TLI->shouldEmitPCReloc(GV)) {
1677     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1678     MI.eraseFromParent();
1679     return true;
1680   }
1681 
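  // Otherwise, build a pc-relative address of the GOT entry for this global
  // and load the global's absolute address from it.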
1682   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1683   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1684 
1685   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1686     MachinePointerInfo::getGOT(MF),
1687     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1688     MachineMemOperand::MOInvariant,
1689     8 /*Size*/, 8 /*Align*/);
1690 
1691   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1692 
1693   if (Ty.getSizeInBits() == 32) {
1694     // Truncate if this is a 32-bit constant address.
1695     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1696     B.buildExtract(DstReg, Load, 0);
1697   } else
1698     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1699 
1700   MI.eraseFromParent();
1701   return true;
1702 }
1703 
1704 bool AMDGPULegalizerInfo::legalizeLoad(
1705   MachineInstr &MI, MachineRegisterInfo &MRI,
1706   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1707   B.setInstr(MI);
1708   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1709   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1710   Observer.changingInstr(MI);
1711   MI.getOperand(1).setReg(Cast.getReg(0));
1712   Observer.changedInstr(MI);
1713   return true;
1714 }
1715 
1716 bool AMDGPULegalizerInfo::legalizeFMad(
1717   MachineInstr &MI, MachineRegisterInfo &MRI,
1718   MachineIRBuilder &B) const {
1719   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1720   assert(Ty.isScalar());
1721 
1722   // TODO: Always legal with future ftz flag.
1723   if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1724     return true;
1725   if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1726     return true;
1727 
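  // G_FMAD is only kept legal when the relevant denormals are flushed (checked
  // above); otherwise expand it into separate multiply and add operations via
  // the generic helper.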
1728   MachineFunction &MF = B.getMF();
1729 
1730   MachineIRBuilder HelperBuilder(MI);
1731   GISelObserverWrapper DummyObserver;
1732   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1733   HelperBuilder.setMBB(*MI.getParent());
1734   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1735 }
1736 
1737 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1738   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1739   Register DstReg = MI.getOperand(0).getReg();
1740   Register PtrReg = MI.getOperand(1).getReg();
1741   Register CmpVal = MI.getOperand(2).getReg();
1742   Register NewVal = MI.getOperand(3).getReg();
1743 
1744   assert(SITargetLowering::isFlatGlobalAddrSpace(
1745            MRI.getType(PtrReg).getAddressSpace()) &&
1746          "this should not have been custom lowered");
1747 
1748   LLT ValTy = MRI.getType(CmpVal);
1749   LLT VecTy = LLT::vector(2, ValTy);
1750 
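  // The target atomic cmpxchg pseudo takes the new value and the compare value
  // packed together into a single vector operand.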
1751   B.setInstr(MI);
1752   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1753 
1754   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1755     .addDef(DstReg)
1756     .addUse(PtrReg)
1757     .addUse(PackedVal)
1758     .setMemRefs(MI.memoperands());
1759 
1760   MI.eraseFromParent();
1761   return true;
1762 }
1763 
1764 // Return the use branch instruction, or null if the usage is invalid.
1765 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1766                                        MachineRegisterInfo &MRI) {
1767   Register CondDef = MI.getOperand(0).getReg();
1768   if (!MRI.hasOneNonDBGUse(CondDef))
1769     return nullptr;
1770 
1771   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1772   return UseMI.getParent() == MI.getParent() &&
1773     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1774 }
1775 
1776 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1777                                                 Register Reg, LLT Ty) const {
1778   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1779   if (LiveIn)
1780     return LiveIn;
1781 
1782   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1783   MRI.addLiveIn(Reg, NewReg);
1784   return NewReg;
1785 }
1786 
1787 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1788                                          const ArgDescriptor *Arg) const {
1789   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1790     return false; // TODO: Handle these
1791 
1792   assert(Arg->getRegister().isPhysical());
1793 
1794   MachineRegisterInfo &MRI = *B.getMRI();
1795 
1796   LLT Ty = MRI.getType(DstReg);
1797   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1798 
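  // Some inputs are packed into a single register and described with a mask
  // (e.g. packed work item IDs); shift the live-in value down and mask out the
  // field belonging to this argument.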
1799   if (Arg->isMasked()) {
1800     // TODO: Should we try to emit this once in the entry block?
1801     const LLT S32 = LLT::scalar(32);
1802     const unsigned Mask = Arg->getMask();
1803     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1804 
1805     Register AndMaskSrc = LiveIn;
1806 
1807     if (Shift != 0) {
1808       auto ShiftAmt = B.buildConstant(S32, Shift);
1809       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1810     }
1811 
1812     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1813   } else
1814     B.buildCopy(DstReg, LiveIn);
1815 
1816   // Insert the argument copy if it doesn't already exist.
1817   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1818   if (!MRI.getVRegDef(LiveIn)) {
1819     // FIXME: Should have scoped insert pt
1820     MachineBasicBlock &OrigInsBB = B.getMBB();
1821     auto OrigInsPt = B.getInsertPt();
1822 
1823     MachineBasicBlock &EntryMBB = B.getMF().front();
1824     EntryMBB.addLiveIn(Arg->getRegister());
1825     B.setInsertPt(EntryMBB, EntryMBB.begin());
1826     B.buildCopy(LiveIn, Arg->getRegister());
1827 
1828     B.setInsertPt(OrigInsBB, OrigInsPt);
1829   }
1830 
1831   return true;
1832 }
1833 
1834 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1835   MachineInstr &MI,
1836   MachineRegisterInfo &MRI,
1837   MachineIRBuilder &B,
1838   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1839   B.setInstr(MI);
1840 
1841   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1842 
1843   const ArgDescriptor *Arg;
1844   const TargetRegisterClass *RC;
1845   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1846   if (!Arg) {
1847     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1848     return false;
1849   }
1850 
1851   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1852     MI.eraseFromParent();
1853     return true;
1854   }
1855 
1856   return false;
1857 }
1858 
1859 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1860                                        MachineRegisterInfo &MRI,
1861                                        MachineIRBuilder &B) const {
1862   B.setInstr(MI);
1863   Register Dst = MI.getOperand(0).getReg();
1864   LLT DstTy = MRI.getType(Dst);
1865   LLT S16 = LLT::scalar(16);
1866   LLT S32 = LLT::scalar(32);
1867 
1868   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1869     return true;
1870 
1871   if (DstTy == S16)
1872     return legalizeFDIV16(MI, MRI, B);
1873   if (DstTy == S32)
1874     return legalizeFDIV32(MI, MRI, B);
1875 
1876   return false;
1877 }
1878 
1879 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1880                                                  MachineRegisterInfo &MRI,
1881                                                  MachineIRBuilder &B) const {
1882   Register Res = MI.getOperand(0).getReg();
1883   Register LHS = MI.getOperand(1).getReg();
1884   Register RHS = MI.getOperand(2).getReg();
1885 
1886   uint16_t Flags = MI.getFlags();
1887 
1888   LLT ResTy = MRI.getType(Res);
1889   LLT S32 = LLT::scalar(32);
1890   LLT S64 = LLT::scalar(64);
1891 
1892   const MachineFunction &MF = B.getMF();
1893   bool Unsafe =
1894     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1895 
1896   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1897     return false;
1898 
1899   if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
1900     return false;
1901 
1902   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1903     // 1 / x -> RCP(x)
1904     if (CLHS->isExactlyValue(1.0)) {
1905       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1906         .addUse(RHS)
1907         .setMIFlags(Flags);
1908 
1909       MI.eraseFromParent();
1910       return true;
1911     }
1912 
1913     // -1 / x -> RCP( FNEG(x) )
1914     if (CLHS->isExactlyValue(-1.0)) {
1915       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1916       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1917         .addUse(FNeg.getReg(0))
1918         .setMIFlags(Flags);
1919 
1920       MI.eraseFromParent();
1921       return true;
1922     }
1923   }
1924 
1925   // x / y -> x * (1.0 / y)
1926   if (Unsafe) {
1927     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1928       .addUse(RHS)
1929       .setMIFlags(Flags);
1930     B.buildFMul(Res, LHS, RCP, Flags);
1931 
1932     MI.eraseFromParent();
1933     return true;
1934   }
1935 
1936   return false;
1937 }
1938 
1939 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1940                                          MachineRegisterInfo &MRI,
1941                                          MachineIRBuilder &B) const {
1942   B.setInstr(MI);
1943   Register Res = MI.getOperand(0).getReg();
1944   Register LHS = MI.getOperand(1).getReg();
1945   Register RHS = MI.getOperand(2).getReg();
1946 
1947   uint16_t Flags = MI.getFlags();
1948 
1949   LLT S16 = LLT::scalar(16);
1950   LLT S32 = LLT::scalar(32);
1951 
1952   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1953   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1954 
1955   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1956     .addUse(RHSExt.getReg(0))
1957     .setMIFlags(Flags);
1958 
1959   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
1960   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
1961 
1962   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
1963     .addUse(RDst.getReg(0))
1964     .addUse(RHS)
1965     .addUse(LHS)
1966     .setMIFlags(Flags);
1967 
1968   MI.eraseFromParent();
1969   return true;
1970 }
1971 
1972 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
1973 // to enable FP32 denormals; otherwise emit instructions that flush them.
1974 static void toggleSPDenormMode(bool Enable,
1975                                const GCNSubtarget &ST,
1976                                MachineIRBuilder &B) {
1977   // Set SP denorm mode to this value.
1978   unsigned SPDenormMode =
1979     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1980 
1981   if (ST.hasDenormModeInst()) {
1982     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
1983     unsigned DPDenormModeDefault = ST.hasFP64Denormals()
1984                                    ? FP_DENORM_FLUSH_NONE
1985                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1986 
1987     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
1988     B.buildInstr(AMDGPU::S_DENORM_MODE)
1989       .addImm(NewDenormModeValue);
1990 
1991   } else {
1992     // Select FP32 bit field in mode register.
1993     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
1994                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
1995                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
1996 
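    // The field (offset 4, width 2, encoded as WIDTH_M1 = 1) covers the two
    // FP32 denormal control bits of the MODE register.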
1997     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
1998       .addImm(SPDenormMode)
1999       .addImm(SPDenormModeBitField);
2000   }
2001 }
2002 
2003 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2004                                          MachineRegisterInfo &MRI,
2005                                          MachineIRBuilder &B) const {
2006   B.setInstr(MI);
2007   Register Res = MI.getOperand(0).getReg();
2008   Register LHS = MI.getOperand(1).getReg();
2009   Register RHS = MI.getOperand(2).getReg();
2010 
2011   uint16_t Flags = MI.getFlags();
2012 
2013   LLT S32 = LLT::scalar(32);
2014   LLT S1 = LLT::scalar(1);
2015 
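  // Expand using the hardware division primitives: scale the operands with
  // div_scale, form an initial reciprocal estimate of the scaled denominator,
  // refine the estimate and the quotient with a chain of FMAs, then combine
  // the results with div_fmas and div_fixup.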
2016   auto One = B.buildFConstant(S32, 1.0f);
2017 
2018   auto DenominatorScaled =
2019     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2020       .addUse(RHS)
2021       .addUse(RHS)
2022       .addUse(LHS)
2023       .setMIFlags(Flags);
2024   auto NumeratorScaled =
2025     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2026       .addUse(LHS)
2027       .addUse(RHS)
2028       .addUse(LHS)
2029       .setMIFlags(Flags);
2030 
2031   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2032     .addUse(DenominatorScaled.getReg(0))
2033     .setMIFlags(Flags);
2034   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2035 
2036   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2037   // aren't modeled as reading it.
2038   if (!ST.hasFP32Denormals())
2039     toggleSPDenormMode(true, ST, B);
2040 
2041   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2042   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2043   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2044   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2045   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2046   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2047 
2048   if (!ST.hasFP32Denormals())
2049     toggleSPDenormMode(false, ST, B);
2050 
2051   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2052     .addUse(Fma4.getReg(0))
2053     .addUse(Fma1.getReg(0))
2054     .addUse(Fma3.getReg(0))
2055     .addUse(NumeratorScaled.getReg(1))
2056     .setMIFlags(Flags);
2057 
2058   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2059     .addUse(Fmas.getReg(0))
2060     .addUse(RHS)
2061     .addUse(LHS)
2062     .setMIFlags(Flags);
2063 
2064   MI.eraseFromParent();
2065   return true;
2066 }
2067 
2068 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2069                                                  MachineRegisterInfo &MRI,
2070                                                  MachineIRBuilder &B) const {
2071   B.setInstr(MI);
2072   Register Res = MI.getOperand(0).getReg();
2073   Register LHS = MI.getOperand(2).getReg();
2074   Register RHS = MI.getOperand(3).getReg();
2075   uint16_t Flags = MI.getFlags();
2076 
2077   LLT S32 = LLT::scalar(32);
2078   LLT S1 = LLT::scalar(1);
2079 
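  // If |denominator| exceeds 2^96 (0x6f800000), pre-scale it by 2^-32
  // (0x2f800000) so its reciprocal stays in range, then multiply the quotient
  // by the same scale factor to compensate.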
2080   auto Abs = B.buildFAbs(S32, RHS, Flags);
2082 
2083   auto C0 = B.buildConstant(S32, 0x6f800000);
2084   auto C1 = B.buildConstant(S32, 0x2f800000);
2085   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2086 
2087   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2088   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2089 
2090   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2091 
2092   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2093     .addUse(Mul0.getReg(0))
2094     .setMIFlags(Flags);
2095 
2096   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2097 
2098   B.buildFMul(Res, Sel, Mul1, Flags);
2099 
2100   MI.eraseFromParent();
2101   return true;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2105                                                  MachineRegisterInfo &MRI,
2106                                                  MachineIRBuilder &B) const {
2107   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2108   if (!MFI->isEntryFunction()) {
2109     return legalizePreloadedArgIntrin(MI, MRI, B,
2110                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2111   }
2112 
2113   B.setInstr(MI);
2114 
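  // In entry functions the implicit arguments are laid out directly after the
  // explicit kernel arguments, so the pointer is the kernarg segment pointer
  // plus a fixed offset.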
2115   uint64_t Offset =
2116     ST.getTargetLowering()->getImplicitParameterOffset(
2117       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2118   Register DstReg = MI.getOperand(0).getReg();
2119   LLT DstTy = MRI.getType(DstReg);
2120   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2121 
2122   const ArgDescriptor *Arg;
2123   const TargetRegisterClass *RC;
2124   std::tie(Arg, RC)
2125     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2126   if (!Arg)
2127     return false;
2128 
2129   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2130   if (!loadInputValue(KernargPtrReg, B, Arg))
2131     return false;
2132 
2133   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2134   MI.eraseFromParent();
2135   return true;
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2139                                               MachineRegisterInfo &MRI,
2140                                               MachineIRBuilder &B,
2141                                               unsigned AddrSpace) const {
2142   B.setInstr(MI);
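  // A flat pointer addresses the queried segment iff the high 32 bits of the
  // pointer equal that segment's aperture base.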
2143   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2144   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2145   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2146   MI.eraseFromParent();
2147   return true;
2148 }
2149 
2150 /// Handle register layout difference for f16 images for some subtargets.
2151 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2152                                              MachineRegisterInfo &MRI,
2153                                              Register Reg) const {
2154   if (!ST.hasUnpackedD16VMem())
2155     return Reg;
2156 
2157   const LLT S16 = LLT::scalar(16);
2158   const LLT S32 = LLT::scalar(32);
2159   LLT StoreVT = MRI.getType(Reg);
2160   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2161 
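  // With unpacked d16 instructions each 16-bit element occupies a full 32-bit
  // register, so any-extend every element and rebuild the value as <N x s32>.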
2162   auto Unmerge = B.buildUnmerge(S16, Reg);
2163 
2164   SmallVector<Register, 4> WideRegs;
2165   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2166     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2167 
2168   int NumElts = StoreVT.getNumElements();
2169 
2170   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2171 }
2172 
2173 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2174                                                  MachineRegisterInfo &MRI,
2175                                                  MachineIRBuilder &B,
2176                                                  bool IsFormat) const {
2177   // TODO: Reject the f16 format on targets where it is unsupported.
2178   Register VData = MI.getOperand(1).getReg();
2179   LLT Ty = MRI.getType(VData);
2180 
2181   B.setInstr(MI);
2182 
2183   const LLT S32 = LLT::scalar(32);
2184   const LLT S16 = LLT::scalar(16);
2185 
2186   // Fixup illegal register types for i8 stores.
2187   if (Ty == LLT::scalar(8) || Ty == S16) {
2188     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2189     MI.getOperand(1).setReg(AnyExt);
2190     return true;
2191   }
2192 
2193   if (Ty.isVector()) {
2194     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2195       if (IsFormat)
2196         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2197       return true;
2198     }
2199 
2200     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2201   }
2202 
2203   return Ty == S32;
2204 }
2205 
2206 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2207                                             MachineRegisterInfo &MRI,
2208                                             MachineIRBuilder &B) const {
2209   // Replace G_BRCOND users with the exec-manipulating branch pseudos.
2210   switch (MI.getIntrinsicID()) {
2211   case Intrinsic::amdgcn_if: {
2212     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2213       const SIRegisterInfo *TRI
2214         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2215 
2216       B.setInstr(*BrCond);
2217       Register Def = MI.getOperand(1).getReg();
2218       Register Use = MI.getOperand(3).getReg();
2219       B.buildInstr(AMDGPU::SI_IF)
2220         .addDef(Def)
2221         .addUse(Use)
2222         .addMBB(BrCond->getOperand(1).getMBB());
2223 
2224       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2225       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2226       MI.eraseFromParent();
2227       BrCond->eraseFromParent();
2228       return true;
2229     }
2230 
2231     return false;
2232   }
2233   case Intrinsic::amdgcn_loop: {
2234     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2235       const SIRegisterInfo *TRI
2236         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2237 
2238       B.setInstr(*BrCond);
2239       Register Reg = MI.getOperand(2).getReg();
2240       B.buildInstr(AMDGPU::SI_LOOP)
2241         .addUse(Reg)
2242         .addMBB(BrCond->getOperand(1).getMBB());
2243       MI.eraseFromParent();
2244       BrCond->eraseFromParent();
2245       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2246       return true;
2247     }
2248 
2249     return false;
2250   }
2251   case Intrinsic::amdgcn_kernarg_segment_ptr:
2252     return legalizePreloadedArgIntrin(
2253       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2254   case Intrinsic::amdgcn_implicitarg_ptr:
2255     return legalizeImplicitArgPtr(MI, MRI, B);
2256   case Intrinsic::amdgcn_workitem_id_x:
2257     return legalizePreloadedArgIntrin(MI, MRI, B,
2258                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2259   case Intrinsic::amdgcn_workitem_id_y:
2260     return legalizePreloadedArgIntrin(MI, MRI, B,
2261                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2262   case Intrinsic::amdgcn_workitem_id_z:
2263     return legalizePreloadedArgIntrin(MI, MRI, B,
2264                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2265   case Intrinsic::amdgcn_workgroup_id_x:
2266     return legalizePreloadedArgIntrin(MI, MRI, B,
2267                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2268   case Intrinsic::amdgcn_workgroup_id_y:
2269     return legalizePreloadedArgIntrin(MI, MRI, B,
2270                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2271   case Intrinsic::amdgcn_workgroup_id_z:
2272     return legalizePreloadedArgIntrin(MI, MRI, B,
2273                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2274   case Intrinsic::amdgcn_dispatch_ptr:
2275     return legalizePreloadedArgIntrin(MI, MRI, B,
2276                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2277   case Intrinsic::amdgcn_queue_ptr:
2278     return legalizePreloadedArgIntrin(MI, MRI, B,
2279                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2280   case Intrinsic::amdgcn_implicit_buffer_ptr:
2281     return legalizePreloadedArgIntrin(
2282       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2283   case Intrinsic::amdgcn_dispatch_id:
2284     return legalizePreloadedArgIntrin(MI, MRI, B,
2285                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2286   case Intrinsic::amdgcn_fdiv_fast:
2287     return legalizeFDIVFastIntrin(MI, MRI, B);
2288   case Intrinsic::amdgcn_is_shared:
2289     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2290   case Intrinsic::amdgcn_is_private:
2291     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2292   case Intrinsic::amdgcn_wavefrontsize: {
2293     B.setInstr(MI);
2294     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2295     MI.eraseFromParent();
2296     return true;
2297   }
2298   case Intrinsic::amdgcn_raw_buffer_store:
2299     return legalizeRawBufferStore(MI, MRI, B, false);
2300   case Intrinsic::amdgcn_raw_buffer_store_format:
2301     return legalizeRawBufferStore(MI, MRI, B, true);
2302   default:
2303     return true;
2304   }
2305 
2306   return true;
2307 }
2308