1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
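// Illustrative note (informal, not exhaustive): this accepts types whose
// scalar or element size is a whole multiple of 32 bits and whose total size
// fits in MaxSize, e.g. s32, s64, v2s32 or v4s64; s16, s48 and v3s16 are
// rejected.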
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
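// Illustrative note: matches 16-bit element vectors with more than two
// elements (e.g. v3s16, v4s16) but not v2s16 or vectors of 32-bit elements.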
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
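// Illustrative note: pads a vector with one extra element, e.g. v3s16 becomes
// v4s16 and v5s32 becomes v6s32; typically paired with isSmallOddVector above.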
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
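// Illustrative note: splits a wide vector into roughly 64-bit pieces, e.g.
// v4s32 (128 bits) becomes v2s32 and v8s16 (128 bits) becomes v4s16; intended
// for use with the vectorWiderThan(..., 64) predicate defined below.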
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
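// For example (informal): v3s8 (24 bits) is widened to v4s8 (32 bits) and
// v5s16 (80 bits) to v6s16 (96 bits).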
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
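// Illustrative note: s32, s64, v2s16, v4s16, v3s32 and v2s64 qualify, while
// v3s16 (odd number of 16-bit elements) and s40 (not a multiple of 32) do not.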
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
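// Illustrative note: matches truncating stores of wide scalars, e.g. an s64
// value stored with a 32-bit (or narrower) memory size; vector stores and
// plain 32-bit stores do not match.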
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
  : ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal);
248 
249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250   // elements for v3s16
251   getActionDefinitionsBuilder(G_PHI)
252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253     .legalFor(AllS32Vectors)
254     .legalFor(AllS64Vectors)
255     .legalFor(AddrSpaces64)
256     .legalFor(AddrSpaces32)
257     .clampScalar(0, S32, S256)
258     .widenScalarToNextPow2(0, 32)
259     .clampMaxNumElements(0, S32, 16)
260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261     .legalIf(isPointer(0));
262 
263   if (ST.has16BitInsts()) {
264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265       .legalFor({S32, S16})
266       .clampScalar(0, S16, S32)
267       .scalarize(0);
268   } else {
269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270       .legalFor({S32})
271       .clampScalar(0, S32, S32)
272       .scalarize(0);
273   }
274 
275   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276     .legalFor({S32})
277     .clampScalar(0, S32, S32)
278     .scalarize(0);
279 
280   // Report legal for any types we can handle anywhere. For the cases only legal
281   // on the SALU, RegBankSelect will be able to re-legalize.
282   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284     .clampScalar(0, S32, S64)
285     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287     .widenScalarToNextPow2(0)
288     .scalarize(0);
289 
290   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292     .legalFor({{S32, S1}})
293     .clampScalar(0, S32, S32)
294     .scalarize(0); // TODO: Implement.
295 
296   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297     .lower();
298 
299   getActionDefinitionsBuilder(G_BITCAST)
300     // Don't worry about the size constraint.
301     .legalIf(all(isRegisterType(0), isRegisterType(1)))
302     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});
304 
305   getActionDefinitionsBuilder(G_FCONSTANT)
306     .legalFor({S32, S64, S16})
307     .clampScalar(0, S16, S64);
308 
309   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .clampScalarOrElt(0, S32, S1024)
314     .legalIf(isMultiple32(0))
315     .widenScalarToNextPow2(0, 32)
316     .clampMaxNumElements(0, S32, 16);
317 
318 
319   // FIXME: i1 operands to intrinsics should always be legal, but other i1
320   // values may not be legal.  We need to figure out how to distinguish
321   // between these two scenarios.
322   getActionDefinitionsBuilder(G_CONSTANT)
323     .legalFor({S1, S32, S64, S16, GlobalPtr,
324                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325     .clampScalar(0, S32, S64)
326     .widenScalarToNextPow2(0)
327     .legalIf(isPointer(0));
328 
329   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332 
333 
334   auto &FPOpActions = getActionDefinitionsBuilder(
335     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336     .legalFor({S32, S64});
337   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338     .customFor({S32, S64});
339   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
340     .customFor({S32, S64});
341 
342   if (ST.has16BitInsts()) {
343     if (ST.hasVOP3PInsts())
344       FPOpActions.legalFor({S16, V2S16});
345     else
346       FPOpActions.legalFor({S16});
347 
348     TrigActions.customFor({S16});
349     FDIVActions.customFor({S16});
350   }
351 
352   auto &MinNumMaxNum = getActionDefinitionsBuilder({
353       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
354 
355   if (ST.hasVOP3PInsts()) {
356     MinNumMaxNum.customFor(FPTypesPK16)
357       .clampMaxNumElements(0, S16, 2)
358       .clampScalar(0, S16, S64)
359       .scalarize(0);
360   } else if (ST.has16BitInsts()) {
361     MinNumMaxNum.customFor(FPTypes16)
362       .clampScalar(0, S16, S64)
363       .scalarize(0);
364   } else {
365     MinNumMaxNum.customFor(FPTypesBase)
366       .clampScalar(0, S32, S64)
367       .scalarize(0);
368   }
369 
370   if (ST.hasVOP3PInsts())
371     FPOpActions.clampMaxNumElements(0, S16, 2);
372 
373   FPOpActions
374     .scalarize(0)
375     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
376 
377   TrigActions
378     .scalarize(0)
379     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
380 
381   FDIVActions
382     .scalarize(0)
383     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
384 
385   getActionDefinitionsBuilder({G_FNEG, G_FABS})
386     .legalFor(FPTypesPK16)
387     .clampMaxNumElements(0, S16, 2)
388     .scalarize(0)
389     .clampScalar(0, S16, S64);
390 
391   // TODO: Implement
392   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
393 
394   if (ST.has16BitInsts()) {
395     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
396       .legalFor({S32, S64, S16})
397       .scalarize(0)
398       .clampScalar(0, S16, S64);
399   } else {
400     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
401       .legalFor({S32, S64})
402       .scalarize(0)
403       .clampScalar(0, S32, S64);
404   }
405 
406   getActionDefinitionsBuilder(G_FPTRUNC)
407     .legalFor({{S32, S64}, {S16, S32}})
408     .scalarize(0);
409 
410   getActionDefinitionsBuilder(G_FPEXT)
411     .legalFor({{S64, S32}, {S32, S16}})
412     .lowerFor({{S64, S16}}) // FIXME: Implement
413     .scalarize(0);
414 
415   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
416   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
417 
418   getActionDefinitionsBuilder(G_FSUB)
419       // Use actual fsub instruction
420       .legalFor({S32})
421       // Must use fadd + fneg
422       .lowerFor({S64, S16, V2S16})
423       .scalarize(0)
424       .clampScalar(0, S32, S64);
425 
426   // Whether this is legal depends on the floating point mode for the function.
427   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
428   if (ST.hasMadF16())
429     FMad.customFor({S32, S16});
430   else
431     FMad.customFor({S32});
432   FMad.scalarize(0)
433       .lower();
434 
435   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
436     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
437                {S32, S1}, {S64, S1}, {S16, S1},
438                {S96, S32},
439                // FIXME: Hack
440                {S64, LLT::scalar(33)},
441                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
442     .scalarize(0);
443 
444   // TODO: Split s1->s64 during regbankselect for VALU.
445   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
446     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
447     .lowerFor({{S32, S64}})
448     .lowerIf(typeIs(1, S1))
449     .customFor({{S64, S64}});
450   if (ST.has16BitInsts())
451     IToFP.legalFor({{S16, S16}});
452   IToFP.clampScalar(1, S32, S64)
453        .scalarize(0);
454 
455   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
456     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
457   if (ST.has16BitInsts())
458     FPToI.legalFor({{S16, S16}});
459   else
460     FPToI.minScalar(1, S32);
461 
462   FPToI.minScalar(0, S32)
463        .scalarize(0);
464 
465   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
466     .legalFor({S32, S64})
467     .scalarize(0);
468 
469   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
470     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
471       .legalFor({S32, S64})
472       .clampScalar(0, S32, S64)
473       .scalarize(0);
474   } else {
475     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
476       .legalFor({S32})
477       .customFor({S64})
478       .clampScalar(0, S32, S64)
479       .scalarize(0);
480   }
481 
482   getActionDefinitionsBuilder(G_PTR_ADD)
483     .legalForCartesianProduct(AddrSpaces64, {S64})
484     .legalForCartesianProduct(AddrSpaces32, {S32})
485     .scalarize(0);
486 
487   getActionDefinitionsBuilder(G_PTR_MASK)
488     .scalarize(0)
489     .alwaysLegal();
490 
491   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
492 
493   auto &CmpBuilder =
494     getActionDefinitionsBuilder(G_ICMP)
495     .legalForCartesianProduct(
496       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
497     .legalFor({{S1, S32}, {S1, S64}});
498   if (ST.has16BitInsts()) {
499     CmpBuilder.legalFor({{S1, S16}});
500   }
501 
502   CmpBuilder
503     .widenScalarToNextPow2(1)
504     .clampScalar(1, S32, S64)
505     .scalarize(0)
506     .legalIf(all(typeIs(0, S1), isPointer(1)));
507 
508   getActionDefinitionsBuilder(G_FCMP)
509     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
510     .widenScalarToNextPow2(1)
511     .clampScalar(1, S32, S64)
512     .scalarize(0);
513 
514   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
515   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
516                                G_FLOG, G_FLOG2, G_FLOG10})
517     .legalFor({S32})
518     .scalarize(0);
519 
520   // The 64-bit versions produce 32-bit results, but only on the SALU.
521   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
522                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
523                                G_CTPOP})
524     .legalFor({{S32, S32}, {S32, S64}})
525     .clampScalar(0, S32, S32)
526     .clampScalar(1, S32, S64)
527     .scalarize(0)
528     .widenScalarToNextPow2(0, 32)
529     .widenScalarToNextPow2(1, 32);
530 
531   // TODO: Expand for > s32
532   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
533     .legalFor({S32})
534     .clampScalar(0, S32, S32)
535     .scalarize(0);
536 
537   if (ST.has16BitInsts()) {
538     if (ST.hasVOP3PInsts()) {
539       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
540         .legalFor({S32, S16, V2S16})
541         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
542         .clampMaxNumElements(0, S16, 2)
543         .clampScalar(0, S16, S32)
544         .widenScalarToNextPow2(0)
545         .scalarize(0);
546     } else {
547       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
548         .legalFor({S32, S16})
549         .widenScalarToNextPow2(0)
550         .clampScalar(0, S16, S32)
551         .scalarize(0);
552     }
553   } else {
554     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
555       .legalFor({S32})
556       .clampScalar(0, S32, S32)
557       .widenScalarToNextPow2(0)
558       .scalarize(0);
559   }
560 
561   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
562     return [=](const LegalityQuery &Query) {
563       return Query.Types[TypeIdx0].getSizeInBits() <
564              Query.Types[TypeIdx1].getSizeInBits();
565     };
566   };
567 
568   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
569     return [=](const LegalityQuery &Query) {
570       return Query.Types[TypeIdx0].getSizeInBits() >
571              Query.Types[TypeIdx1].getSizeInBits();
572     };
573   };
574 
575   getActionDefinitionsBuilder(G_INTTOPTR)
576     // List the common cases
577     .legalForCartesianProduct(AddrSpaces64, {S64})
578     .legalForCartesianProduct(AddrSpaces32, {S32})
579     .scalarize(0)
580     // Accept any address space as long as the size matches
581     .legalIf(sameSize(0, 1))
582     .widenScalarIf(smallerThan(1, 0),
583       [](const LegalityQuery &Query) {
584         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
585       })
586     .narrowScalarIf(greaterThan(1, 0),
587       [](const LegalityQuery &Query) {
588         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
589       });
590 
591   getActionDefinitionsBuilder(G_PTRTOINT)
592     // List the common cases
593     .legalForCartesianProduct(AddrSpaces64, {S64})
594     .legalForCartesianProduct(AddrSpaces32, {S32})
595     .scalarize(0)
596     // Accept any address space as long as the size matches
597     .legalIf(sameSize(0, 1))
598     .widenScalarIf(smallerThan(0, 1),
599       [](const LegalityQuery &Query) {
600         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
601       })
602     .narrowScalarIf(
603       greaterThan(0, 1),
604       [](const LegalityQuery &Query) {
605         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
606       });
607 
608   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
609     .scalarize(0)
610     .custom();
611 
  // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
613   // handle some operations by just promoting the register during
614   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
615   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
616     switch (AS) {
617     // FIXME: Private element size.
618     case AMDGPUAS::PRIVATE_ADDRESS:
619       return 32;
620     // FIXME: Check subtarget
621     case AMDGPUAS::LOCAL_ADDRESS:
622       return ST.useDS128() ? 128 : 64;
623 
624     // Treat constant and global as identical. SMRD loads are sometimes usable
625     // for global loads (ideally constant address space should be eliminated)
626     // depending on the context. Legality cannot be context dependent, but
627     // RegBankSelect can split the load as necessary depending on the pointer
628     // register bank/uniformity and if the memory is invariant or not written in
629     // a kernel.
630     case AMDGPUAS::CONSTANT_ADDRESS:
631     case AMDGPUAS::GLOBAL_ADDRESS:
632       return 512;
633     default:
634       return 128;
635     }
636   };
637 
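  // Returns true when a memory access has to be broken up, e.g. a vector
  // extload, an access wider than maxSizeForAddrSpace for its address space,
  // a 96-bit (3-dword) access on subtargets without dwordx3 load/stores, or an
  // under-aligned access the target does not allow (informal summary of the
  // checks below).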
638   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
639     const LLT DstTy = Query.Types[0];
640 
641     // Split vector extloads.
642     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
643     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
644       return true;
645 
646     const LLT PtrTy = Query.Types[1];
647     unsigned AS = PtrTy.getAddressSpace();
648     if (MemSize > maxSizeForAddrSpace(AS))
649       return true;
650 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
652     // TODO: May be able to widen depending on alignment etc.
653     unsigned NumRegs = MemSize / 32;
654     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
655       return true;
656 
657     unsigned Align = Query.MMODescrs[0].AlignInBits;
658     if (Align < MemSize) {
659       const SITargetLowering *TLI = ST.getTargetLowering();
660       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
661     }
662 
663     return false;
664   };
665 
666   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
667   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
668   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
669 
670   // TODO: Refine based on subtargets which support unaligned access or 128-bit
671   // LDS
672   // TODO: Unsupported flat for SI.
673 
674   for (unsigned Op : {G_LOAD, G_STORE}) {
675     const bool IsStore = Op == G_STORE;
676 
677     auto &Actions = getActionDefinitionsBuilder(Op);
678     // Whitelist the common cases.
679     // TODO: Pointer loads
680     // TODO: Wide constant loads
681     // TODO: Only CI+ has 3x loads
682     // TODO: Loads to s16 on gfx9
683     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
684                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
685                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
686                                       {S96, GlobalPtr, 96, GlobalAlign32},
687                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
688                                       {S128, GlobalPtr, 128, GlobalAlign32},
689                                       {S64, GlobalPtr, 64, GlobalAlign32},
690                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
691                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
692                                       {S32, GlobalPtr, 8, GlobalAlign8},
693                                       {S32, GlobalPtr, 16, GlobalAlign16},
694 
695                                       {S32, LocalPtr, 32, 32},
696                                       {S64, LocalPtr, 64, 32},
697                                       {V2S32, LocalPtr, 64, 32},
698                                       {S32, LocalPtr, 8, 8},
699                                       {S32, LocalPtr, 16, 16},
700                                       {V2S16, LocalPtr, 32, 32},
701 
702                                       {S32, PrivatePtr, 32, 32},
703                                       {S32, PrivatePtr, 8, 8},
704                                       {S32, PrivatePtr, 16, 16},
705                                       {V2S16, PrivatePtr, 32, 32},
706 
707                                       {S32, FlatPtr, 32, GlobalAlign32},
708                                       {S32, FlatPtr, 16, GlobalAlign16},
709                                       {S32, FlatPtr, 8, GlobalAlign8},
710                                       {V2S16, FlatPtr, 32, GlobalAlign32},
711 
712                                       {S32, ConstantPtr, 32, GlobalAlign32},
713                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
714                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
715                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
716                                       {S64, ConstantPtr, 64, GlobalAlign32},
717                                       {S128, ConstantPtr, 128, GlobalAlign32},
718                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
719     Actions
720         .customIf(typeIs(1, Constant32Ptr))
721         .narrowScalarIf(
722             [=](const LegalityQuery &Query) -> bool {
723               return !Query.Types[0].isVector() && needToSplitLoad(Query);
724             },
725             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
726               const LLT DstTy = Query.Types[0];
727               const LLT PtrTy = Query.Types[1];
728 
729               const unsigned DstSize = DstTy.getSizeInBits();
730               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
731 
732               // Split extloads.
733               if (DstSize > MemSize)
734                 return std::make_pair(0, LLT::scalar(MemSize));
735 
736               if (DstSize > 32 && (DstSize % 32 != 0)) {
737                 // FIXME: Need a way to specify non-extload of larger size if
738                 // suitably aligned.
739                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
740               }
741 
742               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
743               if (MemSize > MaxSize)
744                 return std::make_pair(0, LLT::scalar(MaxSize));
745 
746               unsigned Align = Query.MMODescrs[0].AlignInBits;
747               return std::make_pair(0, LLT::scalar(Align));
748             })
749         .fewerElementsIf(
750             [=](const LegalityQuery &Query) -> bool {
751               return Query.Types[0].isVector() && needToSplitLoad(Query);
752             },
753             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
754               const LLT DstTy = Query.Types[0];
755               const LLT PtrTy = Query.Types[1];
756 
757               LLT EltTy = DstTy.getElementType();
758               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
759 
760               // Split if it's too large for the address space.
761               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
762                 unsigned NumElts = DstTy.getNumElements();
763                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
764 
765                 // FIXME: Refine when odd breakdowns handled
766                 // The scalars will need to be re-legalized.
767                 if (NumPieces == 1 || NumPieces >= NumElts ||
768                     NumElts % NumPieces != 0)
769                   return std::make_pair(0, EltTy);
770 
771                 return std::make_pair(0,
772                                       LLT::vector(NumElts / NumPieces, EltTy));
773               }
774 
775               // Need to split because of alignment.
776               unsigned Align = Query.MMODescrs[0].AlignInBits;
777               unsigned EltSize = EltTy.getSizeInBits();
778               if (EltSize > Align &&
779                   (EltSize / Align < DstTy.getNumElements())) {
780                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
781               }
782 
783               // May need relegalization for the scalars.
784               return std::make_pair(0, EltTy);
785             })
786         .minScalar(0, S32);
787 
788     if (IsStore)
789       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
790 
791     // TODO: Need a bitcast lower option?
792     Actions
793         .legalIf([=](const LegalityQuery &Query) {
794           const LLT Ty0 = Query.Types[0];
795           unsigned Size = Ty0.getSizeInBits();
796           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
797           unsigned Align = Query.MMODescrs[0].AlignInBits;
798 
799           // No extending vector loads.
800           if (Size > MemSize && Ty0.isVector())
801             return false;
802 
803           // FIXME: Widening store from alignment not valid.
804           if (MemSize < Size)
805             MemSize = std::max(MemSize, Align);
806 
807           switch (MemSize) {
808           case 8:
809           case 16:
810             return Size == 32;
811           case 32:
812           case 64:
813           case 128:
814             return true;
815           case 96:
816             return ST.hasDwordx3LoadStores();
817           case 256:
818           case 512:
819             return true;
820           default:
821             return false;
822           }
823         })
824         .widenScalarToNextPow2(0)
825         // TODO: v3s32->v4s32 with alignment
826         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
827   }
828 
829   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
830                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
831                                                   {S32, GlobalPtr, 16, 2 * 8},
832                                                   {S32, LocalPtr, 8, 8},
833                                                   {S32, LocalPtr, 16, 16},
834                                                   {S32, PrivatePtr, 8, 8},
835                                                   {S32, PrivatePtr, 16, 16},
836                                                   {S32, ConstantPtr, 8, 8},
837                                                   {S32, ConstantPtr, 16, 2 * 8}});
838   if (ST.hasFlatAddressSpace()) {
839     ExtLoads.legalForTypesWithMemDesc(
840         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
841   }
842 
843   ExtLoads.clampScalar(0, S32, S32)
844           .widenScalarToNextPow2(0)
845           .unsupportedIfMemSizeNotPow2()
846           .lower();
847 
848   auto &Atomics = getActionDefinitionsBuilder(
849     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
850      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
851      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
852      G_ATOMICRMW_UMIN})
853     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
854                {S64, GlobalPtr}, {S64, LocalPtr}});
855   if (ST.hasFlatAddressSpace()) {
856     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
857   }
858 
859   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
860     .legalFor({{S32, LocalPtr}});
861 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
864   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
865     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
866                 {S32, FlatPtr}, {S64, FlatPtr}})
867     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
868                {S32, RegionPtr}, {S64, RegionPtr}});
869 
870   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
871     .lower();
872 
873   // TODO: Pointer types, any 32-bit or 64-bit vector
874   getActionDefinitionsBuilder(G_SELECT)
875     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
876           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
877           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
878     .clampScalar(0, S16, S64)
879     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
880     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
881     .scalarize(1)
882     .clampMaxNumElements(0, S32, 2)
883     .clampMaxNumElements(0, LocalPtr, 2)
884     .clampMaxNumElements(0, PrivatePtr, 2)
885     .scalarize(0)
886     .widenScalarToNextPow2(0)
887     .legalIf(all(isPointer(0), typeIs(1, S1)));
888 
889   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
890   // be more flexible with the shift amount type.
891   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
892     .legalFor({{S32, S32}, {S64, S32}});
893   if (ST.has16BitInsts()) {
894     if (ST.hasVOP3PInsts()) {
895       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
896             .clampMaxNumElements(0, S16, 2);
    } else {
      Shifts.legalFor({{S16, S32}, {S16, S16}});
    }
899 
900     Shifts.clampScalar(1, S16, S32);
901     Shifts.clampScalar(0, S16, S64);
902     Shifts.widenScalarToNextPow2(0, 16);
903   } else {
904     // Make sure we legalize the shift amount type first, as the general
905     // expansion for the shifted type will produce much worse code if it hasn't
906     // been truncated already.
907     Shifts.clampScalar(1, S32, S32);
908     Shifts.clampScalar(0, S32, S64);
909     Shifts.widenScalarToNextPow2(0, 32);
910   }
911   Shifts.scalarize(0);
912 
913   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
914     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
915     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
916     unsigned IdxTypeIdx = 2;
917 
918     getActionDefinitionsBuilder(Op)
919       .customIf([=](const LegalityQuery &Query) {
920           const LLT EltTy = Query.Types[EltTypeIdx];
921           const LLT VecTy = Query.Types[VecTypeIdx];
922           const LLT IdxTy = Query.Types[IdxTypeIdx];
923           return (EltTy.getSizeInBits() == 16 ||
924                   EltTy.getSizeInBits() % 32 == 0) &&
925                  VecTy.getSizeInBits() % 32 == 0 &&
926                  VecTy.getSizeInBits() <= 1024 &&
927                  IdxTy.getSizeInBits() == 32;
928         })
929       .clampScalar(EltTypeIdx, S32, S64)
930       .clampScalar(VecTypeIdx, S32, S64)
931       .clampScalar(IdxTypeIdx, S32, S32);
932   }
933 
934   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
935     .unsupportedIf([=](const LegalityQuery &Query) {
936         const LLT &EltTy = Query.Types[1].getElementType();
937         return Query.Types[0] != EltTy;
938       });
939 
940   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
941     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
942     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
943 
944     // FIXME: Doesn't handle extract of illegal sizes.
945     getActionDefinitionsBuilder(Op)
946       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
947       // FIXME: Multiples of 16 should not be legal.
948       .legalIf([=](const LegalityQuery &Query) {
949           const LLT BigTy = Query.Types[BigTyIdx];
950           const LLT LitTy = Query.Types[LitTyIdx];
951           return (BigTy.getSizeInBits() % 32 == 0) &&
952                  (LitTy.getSizeInBits() % 16 == 0);
953         })
954       .widenScalarIf(
955         [=](const LegalityQuery &Query) {
956           const LLT BigTy = Query.Types[BigTyIdx];
957           return (BigTy.getScalarSizeInBits() < 16);
958         },
959         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
960       .widenScalarIf(
961         [=](const LegalityQuery &Query) {
962           const LLT LitTy = Query.Types[LitTyIdx];
963           return (LitTy.getScalarSizeInBits() < 16);
964         },
965         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
966       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
967       .widenScalarToNextPow2(BigTyIdx, 32);
968 
969   }
970 
971   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
972     .legalForCartesianProduct(AllS32Vectors, {S32})
973     .legalForCartesianProduct(AllS64Vectors, {S64})
974     .clampNumElements(0, V16S32, V32S32)
975     .clampNumElements(0, V2S64, V16S64)
976     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
977 
978   if (ST.hasScalarPackInsts())
979     BuildVector.legalFor({V2S16, S32});
980 
981   BuildVector
982     .minScalarSameAs(1, 0)
983     .legalIf(isRegisterType(0))
984     .minScalarOrElt(0, S32);
985 
986   if (ST.hasScalarPackInsts()) {
987     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
988       .legalFor({V2S16, S32})
989       .lower();
990   } else {
991     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
992       .lower();
993   }
994 
995   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
996     .legalIf(isRegisterType(0));
997 
998   // TODO: Don't fully scalarize v2s16 pieces
999   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1000 
1001   // Merge/Unmerge
1002   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1003     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1004     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1005 
1006     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1007       const LLT &Ty = Query.Types[TypeIdx];
1008       if (Ty.isVector()) {
1009         const LLT &EltTy = Ty.getElementType();
1010         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1011           return true;
1012         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1013           return true;
1014       }
1015       return false;
1016     };
1017 
1018     auto &Builder = getActionDefinitionsBuilder(Op)
1019       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1023       .clampScalar(LitTyIdx, S16, S256)
1024       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1025       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1026       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1027                            elementTypeIs(1, S16)),
1028                        changeTo(1, V2S16))
1029       // Break up vectors with weird elements into scalars
1030       .fewerElementsIf(
1031         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1032         scalarize(0))
1033       .fewerElementsIf(
1034         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1035         scalarize(1))
1036       .clampScalar(BigTyIdx, S32, S1024)
1037       .lowerFor({{S16, V2S16}});
1038 
1039     if (Op == G_MERGE_VALUES) {
1040       Builder.widenScalarIf(
1041         // TODO: Use 16-bit shifts if legal for 8-bit values?
1042         [=](const LegalityQuery &Query) {
1043           const LLT Ty = Query.Types[LitTyIdx];
1044           return Ty.getSizeInBits() < 32;
1045         },
1046         changeTo(LitTyIdx, S32));
1047     }
1048 
1049     Builder.widenScalarIf(
1050       [=](const LegalityQuery &Query) {
1051         const LLT Ty = Query.Types[BigTyIdx];
1052         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1053           Ty.getSizeInBits() % 16 != 0;
1054       },
1055       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 above 128,
        // whichever is smaller.
1058         const LLT &Ty = Query.Types[BigTyIdx];
1059         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1060         if (NewSizeInBits >= 256) {
1061           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1062           if (RoundedTo < NewSizeInBits)
1063             NewSizeInBits = RoundedTo;
1064         }
1065         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1066       })
1067       .legalIf([=](const LegalityQuery &Query) {
1068           const LLT &BigTy = Query.Types[BigTyIdx];
1069           const LLT &LitTy = Query.Types[LitTyIdx];
1070 
1071           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1072             return false;
1073           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1074             return false;
1075 
1076           return BigTy.getSizeInBits() % 16 == 0 &&
1077                  LitTy.getSizeInBits() % 16 == 0 &&
1078                  BigTy.getSizeInBits() <= 1024;
1079         })
1080       // Any vectors left are the wrong size. Scalarize them.
1081       .scalarize(0)
1082       .scalarize(1);
1083   }
1084 
1085   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1086 
1087   computeTables();
1088   verify(*ST.getInstrInfo());
1089 }
1090 
1091 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1092                                          MachineRegisterInfo &MRI,
1093                                          MachineIRBuilder &B,
1094                                          GISelChangeObserver &Observer) const {
1095   switch (MI.getOpcode()) {
1096   case TargetOpcode::G_ADDRSPACE_CAST:
1097     return legalizeAddrSpaceCast(MI, MRI, B);
1098   case TargetOpcode::G_FRINT:
1099     return legalizeFrint(MI, MRI, B);
1100   case TargetOpcode::G_FCEIL:
1101     return legalizeFceil(MI, MRI, B);
1102   case TargetOpcode::G_INTRINSIC_TRUNC:
1103     return legalizeIntrinsicTrunc(MI, MRI, B);
1104   case TargetOpcode::G_SITOFP:
1105     return legalizeITOFP(MI, MRI, B, true);
1106   case TargetOpcode::G_UITOFP:
1107     return legalizeITOFP(MI, MRI, B, false);
1108   case TargetOpcode::G_FMINNUM:
1109   case TargetOpcode::G_FMAXNUM:
1110   case TargetOpcode::G_FMINNUM_IEEE:
1111   case TargetOpcode::G_FMAXNUM_IEEE:
1112     return legalizeMinNumMaxNum(MI, MRI, B);
1113   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1114     return legalizeExtractVectorElt(MI, MRI, B);
1115   case TargetOpcode::G_INSERT_VECTOR_ELT:
1116     return legalizeInsertVectorElt(MI, MRI, B);
1117   case TargetOpcode::G_FSIN:
1118   case TargetOpcode::G_FCOS:
1119     return legalizeSinCos(MI, MRI, B);
1120   case TargetOpcode::G_GLOBAL_VALUE:
1121     return legalizeGlobalValue(MI, MRI, B);
1122   case TargetOpcode::G_LOAD:
1123     return legalizeLoad(MI, MRI, B, Observer);
1124   case TargetOpcode::G_FMAD:
1125     return legalizeFMad(MI, MRI, B);
1126   case TargetOpcode::G_FDIV:
1127     return legalizeFDIV(MI, MRI, B);
1128   case TargetOpcode::G_ATOMIC_CMPXCHG:
1129     return legalizeAtomicCmpXChg(MI, MRI, B);
1130   default:
1131     return false;
1132   }
1133 
1134   llvm_unreachable("expected switch to return");
1135 }
1136 
1137 Register AMDGPULegalizerInfo::getSegmentAperture(
1138   unsigned AS,
1139   MachineRegisterInfo &MRI,
1140   MachineIRBuilder &B) const {
1141   MachineFunction &MF = B.getMF();
1142   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1143   const LLT S32 = LLT::scalar(32);
1144 
1145   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1146 
1147   if (ST.hasApertureRegs()) {
1148     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1149     // getreg.
1150     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1151         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1152         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1153     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1154         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1155         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1156     unsigned Encoding =
1157         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1158         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1159         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1160 
1161     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1162     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1163 
1164     B.buildInstr(AMDGPU::S_GETREG_B32)
1165       .addDef(GetReg)
1166       .addImm(Encoding);
1167     MRI.setType(GetReg, S32);
1168 
1169     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1170     B.buildInstr(TargetOpcode::G_SHL)
1171       .addDef(ApertureReg)
1172       .addUse(GetReg)
1173       .addUse(ShiftAmt.getReg(0));
1174 
1175     return ApertureReg;
1176   }
1177 
1178   Register QueuePtr = MRI.createGenericVirtualRegister(
1179     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1180 
1181   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1182   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1183     return Register();
1184 
1185   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1186   // private_segment_aperture_base_hi.
1187   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1188 
1189   // FIXME: Don't use undef
1190   Value *V = UndefValue::get(PointerType::get(
1191                                Type::getInt8Ty(MF.getFunction().getContext()),
1192                                AMDGPUAS::CONSTANT_ADDRESS));
1193 
1194   MachinePointerInfo PtrInfo(V, StructOffset);
1195   MachineMemOperand *MMO = MF.getMachineMemOperand(
1196     PtrInfo,
1197     MachineMemOperand::MOLoad |
1198     MachineMemOperand::MODereferenceable |
1199     MachineMemOperand::MOInvariant,
1200     4,
1201     MinAlign(64, StructOffset));
1202 
1203   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1204   Register LoadAddr;
1205 
1206   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1207   B.buildLoad(LoadResult, LoadAddr, *MMO);
1208   return LoadResult;
1209 }
1210 
1211 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1212   MachineInstr &MI, MachineRegisterInfo &MRI,
1213   MachineIRBuilder &B) const {
1214   MachineFunction &MF = B.getMF();
1215 
1216   B.setInstr(MI);
1217 
1218   const LLT S32 = LLT::scalar(32);
1219   Register Dst = MI.getOperand(0).getReg();
1220   Register Src = MI.getOperand(1).getReg();
1221 
1222   LLT DstTy = MRI.getType(Dst);
1223   LLT SrcTy = MRI.getType(Src);
1224   unsigned DestAS = DstTy.getAddressSpace();
1225   unsigned SrcAS = SrcTy.getAddressSpace();
1226 
1227   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1228   // vector element.
1229   assert(!DstTy.isVector());
1230 
1231   const AMDGPUTargetMachine &TM
1232     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1233 
1234   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1235   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1236     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1237     return true;
1238   }
1239 
1240   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1241     // Truncate.
1242     B.buildExtract(Dst, Src, 0);
1243     MI.eraseFromParent();
1244     return true;
1245   }
1246 
1247   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1248     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1249     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1250 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1254     auto HighAddr = B.buildConstant(
1255       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1256     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1257     MI.eraseFromParent();
1258     return true;
1259   }
1260 
1261   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1262     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1263            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1264     unsigned NullVal = TM.getNullPointerValue(DestAS);
1265 
1266     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1267     auto FlatNull = B.buildConstant(SrcTy, 0);
1268 
1269     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1270 
    // Extract the low 32 bits of the pointer.
1272     B.buildExtract(PtrLo32, Src, 0);
1273 
1274     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1275     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1276     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1277 
1278     MI.eraseFromParent();
1279     return true;
1280   }
1281 
1282   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1283     return false;
1284 
1285   if (!ST.hasFlatAddressSpace())
1286     return false;
1287 
1288   auto SegmentNull =
1289       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1290   auto FlatNull =
1291       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1292 
1293   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1294   if (!ApertureReg.isValid())
1295     return false;
1296 
1297   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1298   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1299 
1300   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1301 
1302   // Coerce the type of the low half of the result so we can use merge_values.
1303   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1304   B.buildInstr(TargetOpcode::G_PTRTOINT)
1305     .addDef(SrcAsInt)
1306     .addUse(Src);
1307 
1308   // TODO: Should we allow mismatched types but matching sizes in merges to
1309   // avoid the ptrtoint?
1310   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1311   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1312 
1313   MI.eraseFromParent();
1314   return true;
1315 }
1316 
1317 bool AMDGPULegalizerInfo::legalizeFrint(
1318   MachineInstr &MI, MachineRegisterInfo &MRI,
1319   MachineIRBuilder &B) const {
1320   B.setInstr(MI);
1321 
1322   Register Src = MI.getOperand(1).getReg();
1323   LLT Ty = MRI.getType(Src);
1324   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1325 
1326   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1327   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
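  // Informal note: C1 is 2^52 and C2 is the largest double below 2^52
  // (2^52 - 0.5). Adding and then subtracting copysign(2^52, src) rounds src
  // to an integer, and values with |src| > C2 are already integral, so the
  // original source is selected for them.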
1328 
1329   auto C1 = B.buildFConstant(Ty, C1Val);
1330   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1331 
1332   // TODO: Should this propagate fast-math-flags?
1333   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1334   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1335 
1336   auto C2 = B.buildFConstant(Ty, C2Val);
1337   auto Fabs = B.buildFAbs(Ty, Src);
1338 
1339   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1340   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1341   return true;
1342 }
1343 
1344 bool AMDGPULegalizerInfo::legalizeFceil(
1345   MachineInstr &MI, MachineRegisterInfo &MRI,
1346   MachineIRBuilder &B) const {
1347   B.setInstr(MI);
1348 
1349   const LLT S1 = LLT::scalar(1);
1350   const LLT S64 = LLT::scalar(64);
1351 
1352   Register Src = MI.getOperand(1).getReg();
1353   assert(MRI.getType(Src) == S64);
1354 
1355   // result = trunc(src)
1356   // if (src > 0.0 && src != result)
1357   //   result += 1.0
1358 
1359   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1360 
1361   const auto Zero = B.buildFConstant(S64, 0.0);
1362   const auto One = B.buildFConstant(S64, 1.0);
1363   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1364   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1365   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1366   auto Add = B.buildSelect(S64, And, One, Zero);
1367 
1368   // TODO: Should this propagate fast-math-flags?
1369   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1370   return true;
1371 }
1372 
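// Extracts the unbiased exponent from the high 32 bits of an f64: amdgcn_ubfe
// pulls ExpBits (11) bits starting at bit FractBits - 32 (20) out of Hi, and
// the IEEE-754 double exponent bias of 1023 is then subtracted.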
1373 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1374                                               MachineIRBuilder &B) {
1375   const unsigned FractBits = 52;
1376   const unsigned ExpBits = 11;
1377   LLT S32 = LLT::scalar(32);
1378 
1379   auto Const0 = B.buildConstant(S32, FractBits - 32);
1380   auto Const1 = B.buildConstant(S32, ExpBits);
1381 
  // ubfe operands: source value, bit offset, bit width.
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1385 
1386   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1387 }
1388 
1389 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1390   MachineInstr &MI, MachineRegisterInfo &MRI,
1391   MachineIRBuilder &B) const {
1392   B.setInstr(MI);
1393 
1394   const LLT S1 = LLT::scalar(1);
1395   const LLT S32 = LLT::scalar(32);
1396   const LLT S64 = LLT::scalar(64);
1397 
1398   Register Src = MI.getOperand(1).getReg();
1399   assert(MRI.getType(Src) == S64);
1400 
1401   // TODO: Should this use extract since the low half is unused?
1402   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1403   Register Hi = Unmerge.getReg(1);
1404 
1405   // Extract the upper half, since this is where we will find the sign and
1406   // exponent.
1407   auto Exp = extractF64Exponent(Hi, B);
1408 
1409   const unsigned FractBits = 52;
1410 
1411   // Extract the sign bit.
1412   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1413   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1414 
1415   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1416 
1417   const auto Zero32 = B.buildConstant(S32, 0);
1418 
  // Extend back to 64 bits.
1420   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1421 
1422   auto Shr = B.buildAShr(S64, FractMask, Exp);
1423   auto Not = B.buildNot(S64, Shr);
1424   auto Tmp0 = B.buildAnd(S64, Src, Not);
1425   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1426 
1427   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1428   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1429 
1430   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1431   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1432   return true;
1433 }
1434 
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

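// G_EXTRACT_VECTOR_ELT/G_INSERT_VECTOR_ELT with a known constant index are
// rewritten as a plain G_EXTRACT/G_INSERT at the bit offset
// index * element-size; a constant index that is out of range produces
// G_IMPLICIT_DEF. Dynamic indices are left for the selector, which uses
// register indexing.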
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

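// Lower G_FSIN/G_FCOS to the amdgcn.sin/amdgcn.cos intrinsics. The hardware
// expects an input scaled by 1/(2*pi), so the source is first multiplied by
// 0.5/PI; on subtargets with a reduced trig input range the scaled value is
// additionally wrapped into [0, 1) with amdgcn.fract.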
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                     *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

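// Flat/global G_ATOMIC_CMPXCHG is rewritten to the G_AMDGPU_ATOMIC_CMPXCHG
// pseudo, which takes the new value and the compare value packed into a
// single two-element vector operand (new value in element 0, compare value
// in element 1). The original memory operands are preserved.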
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
    UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

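// Copy a preloaded argument register into DstReg. Several arguments may
// share one physical register; a masked ArgDescriptor (e.g. a packed
// workitem ID) records which bits belong to this argument, so the live-in
// value is shifted and masked before the final copy. If no copy from the
// physical register exists yet, one is inserted in the entry block.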
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

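// Try to fold an FDIV that may be computed inaccurately into cheaper code:
//   1.0 / x  -> amdgcn.rcp(x)
//   -1.0 / x -> amdgcn.rcp(fneg(x))
//   x / y    -> x * amdgcn.rcp(y)   (only under unsafe-fp-math or arcp)
// Returns false when none of these apply (e.g. f32 with denormals enabled,
// or f64 without unsafe-fp-math), so the caller emits the full expansion.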
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

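// Expand f16 division by promoting both operands to f32, multiplying the
// numerator by amdgcn.rcp of the denominator, truncating the quotient back
// to f16, and running amdgcn.div.fixup on the result to handle the remaining
// special cases.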
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
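// The FP32 denormal controls live in the hardware MODE register. Targets
// with S_DENORM_MODE write the 4-bit denorm field directly (FP32 control in
// bits [1:0], FP64/FP16 control in bits [3:2] of the immediate); older
// targets use S_SETREG_IMM32_B32 to update only the 2-bit FP32 field at
// offset 4 of the MODE register.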
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
                                   ? FP_DENORM_FLUSH_NONE
                                   : FP_DENORM_FLUSH_IN_FLUSH_OUT;

    unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

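// Full-precision f32 division built from the hardware division helpers:
// div_scale moves the numerator and denominator into a safe range, amdgcn.rcp
// provides an initial reciprocal estimate, a short FMA-based Newton-Raphson
// sequence refines the reciprocal and the quotient, div_fmas applies the
// final correction using the div_scale condition bit, and div_fixup undoes
// the scaling and handles the special cases. If FP32 denormals are disabled
// for the function, denormal support is temporarily enabled around the FMA
// sequence, since the scaled intermediates may be denormal.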
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

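// f64 division follows the same div_scale / rcp / FMA refinement / div_fmas
// / div_fixup pattern as the f32 path, with extra refinement steps for the
// wider type. On subtargets where the div_scale condition output is unusable
// (SI), the condition is reconstructed by comparing the high 32-bit halves
// of the scaled values against those of the original operands.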
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(RHS)
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.

    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

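// amdgcn.fdiv.fast: fast, limited-accuracy f32 division. If |denominator|
// exceeds 2^96 (0x6f800000), both the denominator and the final result are
// scaled by 2^-32 (0x2f800000) so the reciprocal stays in range; otherwise
// the scale is 1.0. The result is numerator * rcp(denominator * scale) *
// scale.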
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

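// amdgcn.is.shared / amdgcn.is.private: a flat pointer addresses LDS or
// scratch memory iff its high 32 bits match the corresponding aperture base,
// so compare the extracted high half of the pointer against the segment
// aperture register.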
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // For the control flow intrinsics, replace the G_BRCOND user with the
  // exec-manipulating branch pseudos.
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}
