1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
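// True if the type's total size is at most MaxSize bits and its scalar (or
// vector element) size is a multiple of 32 bits.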
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
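// Vectors with an odd number of sub-32-bit elements whose total size is not a
// multiple of 32 bits; paired with oneMoreElement below to pad them out.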
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
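// Mutation: grow the vector at TypeIdx by one element of the same element
// type, e.g. v3s16 -> v4s16.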
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
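// Mutation: reduce the element count at TypeIdx so each resulting piece fits
// in 64 bits, e.g. v4s32 (128 bits) -> v2s32 and v3s32 (96 bits) -> v2s32.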
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
94 // Increase the number of vector elements so that the total size becomes the
95 // next multiple of 32 bits.
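// For example, v3s8 (24 bits) becomes v4s8 (32 bits) and v5s16 (80 bits)
// becomes v6s16 (96 bits).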
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
133 // Vectors of 32/64/128/256-bit elements or of an even number of 16-bit
134 // elements, and scalars that are a multiple of 32 bits up to 1024 bits.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
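// Scalars wider than 32 bits whose memory size is smaller than the value size,
// i.e. wide truncating stores; these are narrowed to 32 bits below.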
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32;
248 
249   setAction({G_BRCOND, S1}, Legal); // VCC branches
250   setAction({G_BRCOND, S32}, Legal); // SCC branches
251 
252   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
253   // elements for v3s16
254   getActionDefinitionsBuilder(G_PHI)
255     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
256     .legalFor(AllS32Vectors)
257     .legalFor(AllS64Vectors)
258     .legalFor(AddrSpaces64)
259     .legalFor(AddrSpaces32)
260     .clampScalar(0, S32, S256)
261     .widenScalarToNextPow2(0, 32)
262     .clampMaxNumElements(0, S32, 16)
263     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
264     .legalIf(isPointer(0));
265 
266   if (ST.has16BitInsts()) {
267     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
268       .legalFor({S32, S16})
269       .clampScalar(0, S16, S32)
270       .scalarize(0);
271   } else {
272     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
273       .legalFor({S32})
274       .clampScalar(0, S32, S32)
275       .scalarize(0);
276   }
277 
278   // FIXME: Not really legal. Placeholder for custom lowering.
279   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
280     .legalFor({S32, S64})
281     .clampScalar(0, S32, S64)
282     .widenScalarToNextPow2(0, 32)
283     .scalarize(0);
284 
285   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
286     .legalFor({S32})
287     .clampScalar(0, S32, S32)
288     .scalarize(0);
289 
290   // Report legal for any types we can handle anywhere. For the cases only legal
291   // on the SALU, RegBankSelect will be able to re-legalize.
292   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
293     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
294     .clampScalar(0, S32, S64)
295     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
296     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
297     .widenScalarToNextPow2(0)
298     .scalarize(0);
299 
300   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
301                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
302     .legalFor({{S32, S1}, {S32, S32}})
303     .clampScalar(0, S32, S32)
304     .scalarize(0); // TODO: Implement.
305 
306   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
307     .lower();
308 
309   getActionDefinitionsBuilder(G_BITCAST)
310     // Don't worry about the size constraint.
311     .legalIf(all(isRegisterType(0), isRegisterType(1)))
312     // FIXME: Testing hack
313     .legalForCartesianProduct({S16, LLT::vector(2, 8), })
314     .lower();
315 
316 
317   getActionDefinitionsBuilder(G_FCONSTANT)
318     .legalFor({S32, S64, S16})
319     .clampScalar(0, S16, S64);
320 
321   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
322     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
323                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
324     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
325     .clampScalarOrElt(0, S32, S1024)
326     .legalIf(isMultiple32(0))
327     .widenScalarToNextPow2(0, 32)
328     .clampMaxNumElements(0, S32, 16);
329 
330 
331   // FIXME: i1 operands to intrinsics should always be legal, but other i1
332   // values may not be legal.  We need to figure out how to distinguish
333   // between these two scenarios.
334   getActionDefinitionsBuilder(G_CONSTANT)
335     .legalFor({S1, S32, S64, S16, GlobalPtr,
336                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
337     .clampScalar(0, S32, S64)
338     .widenScalarToNextPow2(0)
339     .legalIf(isPointer(0));
340 
341   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
342   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
343     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
344 
345 
346   auto &FPOpActions = getActionDefinitionsBuilder(
347     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
348     .legalFor({S32, S64});
349   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
350     .customFor({S32, S64});
351   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
352     .customFor({S32, S64});
353 
354   if (ST.has16BitInsts()) {
355     if (ST.hasVOP3PInsts())
356       FPOpActions.legalFor({S16, V2S16});
357     else
358       FPOpActions.legalFor({S16});
359 
360     TrigActions.customFor({S16});
361     FDIVActions.customFor({S16});
362   }
363 
364   auto &MinNumMaxNum = getActionDefinitionsBuilder({
365       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
366 
367   if (ST.hasVOP3PInsts()) {
368     MinNumMaxNum.customFor(FPTypesPK16)
369       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
370       .clampMaxNumElements(0, S16, 2)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else if (ST.has16BitInsts()) {
374     MinNumMaxNum.customFor(FPTypes16)
375       .clampScalar(0, S16, S64)
376       .scalarize(0);
377   } else {
378     MinNumMaxNum.customFor(FPTypesBase)
379       .clampScalar(0, S32, S64)
380       .scalarize(0);
381   }
382 
383   if (ST.hasVOP3PInsts())
384     FPOpActions.clampMaxNumElements(0, S16, 2);
385 
386   FPOpActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   TrigActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   FDIVActions
395     .scalarize(0)
396     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
397 
398   getActionDefinitionsBuilder({G_FNEG, G_FABS})
399     .legalFor(FPTypesPK16)
400     .clampMaxNumElements(0, S16, 2)
401     .scalarize(0)
402     .clampScalar(0, S16, S64);
403 
404   // TODO: Implement
405   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
406 
407   if (ST.has16BitInsts()) {
408     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
409       .legalFor({S32, S64, S16})
410       .scalarize(0)
411       .clampScalar(0, S16, S64);
412   } else {
413     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
414       .legalFor({S32, S64})
415       .scalarize(0)
416       .clampScalar(0, S32, S64);
417   }
418 
419   getActionDefinitionsBuilder(G_FPTRUNC)
420     .legalFor({{S32, S64}, {S16, S32}})
421     .scalarize(0);
422 
423   getActionDefinitionsBuilder(G_FPEXT)
424     .legalFor({{S64, S32}, {S32, S16}})
425     .lowerFor({{S64, S16}}) // FIXME: Implement
426     .scalarize(0);
427 
428   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
429   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
430 
431   getActionDefinitionsBuilder(G_FSUB)
432       // Use actual fsub instruction
433       .legalFor({S32})
434       // Must use fadd + fneg
435       .lowerFor({S64, S16, V2S16})
436       .scalarize(0)
437       .clampScalar(0, S32, S64);
438 
439   // Whether this is legal depends on the floating point mode for the function.
440   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
441   if (ST.hasMadF16())
442     FMad.customFor({S32, S16});
443   else
444     FMad.customFor({S32});
445   FMad.scalarize(0)
446       .lower();
447 
448   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
449     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
450                {S32, S1}, {S64, S1}, {S16, S1},
451                {S96, S32},
452                // FIXME: Hack
453                {S64, LLT::scalar(33)},
454                {S32, S8}, {S32, LLT::scalar(24)}})
455     .scalarize(0)
456     .clampScalar(0, S32, S64);
457 
458   // TODO: Split s1->s64 during regbankselect for VALU.
459   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
460     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
461     .lowerFor({{S32, S64}})
462     .lowerIf(typeIs(1, S1))
463     .customFor({{S64, S64}});
464   if (ST.has16BitInsts())
465     IToFP.legalFor({{S16, S16}});
466   IToFP.clampScalar(1, S32, S64)
467        .scalarize(0);
468 
469   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
470     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
471   if (ST.has16BitInsts())
472     FPToI.legalFor({{S16, S16}});
473   else
474     FPToI.minScalar(1, S32);
475 
476   FPToI.minScalar(0, S32)
477        .scalarize(0);
478 
479   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
480     .scalarize(0)
481     .lower();
482 
483   if (ST.has16BitInsts()) {
484     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
485       .legalFor({S16, S32, S64})
486       .clampScalar(0, S16, S64)
487       .scalarize(0);
488   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
489     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
490       .legalFor({S32, S64})
491       .clampScalar(0, S32, S64)
492       .scalarize(0);
493   } else {
494     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
495       .legalFor({S32})
496       .customFor({S64})
497       .clampScalar(0, S32, S64)
498       .scalarize(0);
499   }
500 
501   getActionDefinitionsBuilder(G_PTR_ADD)
502     .legalForCartesianProduct(AddrSpaces64, {S64})
503     .legalForCartesianProduct(AddrSpaces32, {S32})
504     .scalarize(0);
505 
506   getActionDefinitionsBuilder(G_PTR_MASK)
507     .scalarize(0)
508     .alwaysLegal();
509 
510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511 
512   auto &CmpBuilder =
513     getActionDefinitionsBuilder(G_ICMP)
514     // The compare output type differs based on the register bank of the output,
515     // so make both s1 and s32 legal.
516     //
517     // Scalar compares producing output in scc will be promoted to s32, as that
518     // is the allocatable register type that will be needed for the copy from
519     // scc. This will be promoted during RegBankSelect, and we assume something
520     // before that won't try to use s32 result types.
521     //
522     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
523     // bank.
524     .legalForCartesianProduct(
525       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
526     .legalForCartesianProduct(
527       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
528   if (ST.has16BitInsts()) {
529     CmpBuilder.legalFor({{S1, S16}});
530   }
531 
532   CmpBuilder
533     .widenScalarToNextPow2(1)
534     .clampScalar(1, S32, S64)
535     .scalarize(0)
536     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
537 
538   getActionDefinitionsBuilder(G_FCMP)
539     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
540     .widenScalarToNextPow2(1)
541     .clampScalar(1, S32, S64)
542     .scalarize(0);
543 
544   // FIXME: fexp, flog2, flog10 need to be custom lowered.
545   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
546                                G_FLOG, G_FLOG2, G_FLOG10})
547     .legalFor({S32})
548     .scalarize(0);
549 
550   // The 64-bit versions produce 32-bit results, but only on the SALU.
551   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
552                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
553                                G_CTPOP})
554     .legalFor({{S32, S32}, {S32, S64}})
555     .clampScalar(0, S32, S32)
556     .clampScalar(1, S32, S64)
557     .scalarize(0)
558     .widenScalarToNextPow2(0, 32)
559     .widenScalarToNextPow2(1, 32);
560 
561   // TODO: Expand for > s32
562   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
563     .legalFor({S32})
564     .clampScalar(0, S32, S32)
565     .scalarize(0);
566 
567   if (ST.has16BitInsts()) {
568     if (ST.hasVOP3PInsts()) {
569       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
570         .legalFor({S32, S16, V2S16})
571         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
572         .clampMaxNumElements(0, S16, 2)
573         .clampScalar(0, S16, S32)
574         .widenScalarToNextPow2(0)
575         .scalarize(0);
576     } else {
577       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
578         .legalFor({S32, S16})
579         .widenScalarToNextPow2(0)
580         .clampScalar(0, S16, S32)
581         .scalarize(0);
582     }
583   } else {
584     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
585       .legalFor({S32})
586       .clampScalar(0, S32, S32)
587       .widenScalarToNextPow2(0)
588       .scalarize(0);
589   }
590 
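  // Compare the total sizes of two type indices; used below to widen or narrow
  // the integer operand of G_INTTOPTR/G_PTRTOINT to match the pointer size.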
591   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
592     return [=](const LegalityQuery &Query) {
593       return Query.Types[TypeIdx0].getSizeInBits() <
594              Query.Types[TypeIdx1].getSizeInBits();
595     };
596   };
597 
598   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
599     return [=](const LegalityQuery &Query) {
600       return Query.Types[TypeIdx0].getSizeInBits() >
601              Query.Types[TypeIdx1].getSizeInBits();
602     };
603   };
604 
605   getActionDefinitionsBuilder(G_INTTOPTR)
606     // List the common cases
607     .legalForCartesianProduct(AddrSpaces64, {S64})
608     .legalForCartesianProduct(AddrSpaces32, {S32})
609     .scalarize(0)
610     // Accept any address space as long as the size matches
611     .legalIf(sameSize(0, 1))
612     .widenScalarIf(smallerThan(1, 0),
613       [](const LegalityQuery &Query) {
614         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
615       })
616     .narrowScalarIf(greaterThan(1, 0),
617       [](const LegalityQuery &Query) {
618         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
619       });
620 
621   getActionDefinitionsBuilder(G_PTRTOINT)
622     // List the common cases
623     .legalForCartesianProduct(AddrSpaces64, {S64})
624     .legalForCartesianProduct(AddrSpaces32, {S32})
625     .scalarize(0)
626     // Accept any address space as long as the size matches
627     .legalIf(sameSize(0, 1))
628     .widenScalarIf(smallerThan(0, 1),
629       [](const LegalityQuery &Query) {
630         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
631       })
632     .narrowScalarIf(
633       greaterThan(0, 1),
634       [](const LegalityQuery &Query) {
635         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
636       });
637 
638   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
639     .scalarize(0)
640     .custom();
641 
642   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
643   // handle some operations by just promoting the register during
644   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
645   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
646     switch (AS) {
647     // FIXME: Private element size.
648     case AMDGPUAS::PRIVATE_ADDRESS:
649       return 32;
650     // FIXME: Check subtarget
651     case AMDGPUAS::LOCAL_ADDRESS:
652       return ST.useDS128() ? 128 : 64;
653 
654     // Treat constant and global as identical. SMRD loads are sometimes usable
655     // for global loads (ideally constant address space should be eliminated)
656     // depending on the context. Legality cannot be context dependent, but
657     // RegBankSelect can split the load as necessary depending on the pointer
658     // register bank/uniformity and if the memory is invariant or not written in
659     // a kernel.
660     case AMDGPUAS::CONSTANT_ADDRESS:
661     case AMDGPUAS::GLOBAL_ADDRESS:
662       return 512;
663     default:
664       return 128;
665     }
666   };
667 
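  // Whether a load/store must be split: vector extloads, accesses wider than
  // the address space allows, dwordx3 accesses on subtargets without them, or
  // under-aligned accesses the target cannot handle.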
668   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
669     const LLT DstTy = Query.Types[0];
670 
671     // Split vector extloads.
672     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
673     unsigned Align = Query.MMODescrs[0].AlignInBits;
674 
675     if (MemSize < DstTy.getSizeInBits())
676       MemSize = std::max(MemSize, Align);
677 
678     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
679       return true;
680 
681     const LLT PtrTy = Query.Types[1];
682     unsigned AS = PtrTy.getAddressSpace();
683     if (MemSize > maxSizeForAddrSpace(AS))
684       return true;
685 
686     // Catch weird sized loads that don't evenly divide into the access sizes
687     // TODO: May be able to widen depending on alignment etc.
688     unsigned NumRegs = MemSize / 32;
689     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
690       return true;
691 
692     if (Align < MemSize) {
693       const SITargetLowering *TLI = ST.getTargetLowering();
694       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
695     }
696 
697     return false;
698   };
699 
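  // Required alignment (in bits) for global/flat/constant accesses; relaxed to
  // 0 when the subtarget supports unaligned buffer access.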
700   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
701   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
702   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
703 
704   // TODO: Refine based on subtargets which support unaligned access or 128-bit
705   // LDS
706   // TODO: Unsupported flat for SI.
707 
708   for (unsigned Op : {G_LOAD, G_STORE}) {
709     const bool IsStore = Op == G_STORE;
710 
711     auto &Actions = getActionDefinitionsBuilder(Op);
712     // Whitelist the common cases.
713     // TODO: Pointer loads
714     // TODO: Wide constant loads
715     // TODO: Only CI+ has 3x loads
716     // TODO: Loads to s16 on gfx9
717     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
718                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
719                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
720                                       {S96, GlobalPtr, 96, GlobalAlign32},
721                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
722                                       {S128, GlobalPtr, 128, GlobalAlign32},
723                                       {S64, GlobalPtr, 64, GlobalAlign32},
724                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
725                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
726                                       {S32, GlobalPtr, 8, GlobalAlign8},
727                                       {S32, GlobalPtr, 16, GlobalAlign16},
728 
729                                       {S32, LocalPtr, 32, 32},
730                                       {S64, LocalPtr, 64, 32},
731                                       {V2S32, LocalPtr, 64, 32},
732                                       {S32, LocalPtr, 8, 8},
733                                       {S32, LocalPtr, 16, 16},
734                                       {V2S16, LocalPtr, 32, 32},
735 
736                                       {S32, PrivatePtr, 32, 32},
737                                       {S32, PrivatePtr, 8, 8},
738                                       {S32, PrivatePtr, 16, 16},
739                                       {V2S16, PrivatePtr, 32, 32},
740 
741                                       {S32, FlatPtr, 32, GlobalAlign32},
742                                       {S32, FlatPtr, 16, GlobalAlign16},
743                                       {S32, FlatPtr, 8, GlobalAlign8},
744                                       {V2S16, FlatPtr, 32, GlobalAlign32},
745 
746                                       {S32, ConstantPtr, 32, GlobalAlign32},
747                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
748                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
749                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
750                                       {S64, ConstantPtr, 64, GlobalAlign32},
751                                       {S128, ConstantPtr, 128, GlobalAlign32},
752                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
753     Actions
754         .customIf(typeIs(1, Constant32Ptr))
755         .narrowScalarIf(
756             [=](const LegalityQuery &Query) -> bool {
757               return !Query.Types[0].isVector() && needToSplitLoad(Query);
758             },
759             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
760               const LLT DstTy = Query.Types[0];
761               const LLT PtrTy = Query.Types[1];
762 
763               const unsigned DstSize = DstTy.getSizeInBits();
764               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
765 
766               // Split extloads.
767               if (DstSize > MemSize)
768                 return std::make_pair(0, LLT::scalar(MemSize));
769 
770               if (DstSize > 32 && (DstSize % 32 != 0)) {
771                 // FIXME: Need a way to specify non-extload of larger size if
772                 // suitably aligned.
773                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
774               }
775 
776               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
777               if (MemSize > MaxSize)
778                 return std::make_pair(0, LLT::scalar(MaxSize));
779 
780               unsigned Align = Query.MMODescrs[0].AlignInBits;
781               return std::make_pair(0, LLT::scalar(Align));
782             })
783         .fewerElementsIf(
784             [=](const LegalityQuery &Query) -> bool {
785               return Query.Types[0].isVector() && needToSplitLoad(Query);
786             },
787             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
788               const LLT DstTy = Query.Types[0];
789               const LLT PtrTy = Query.Types[1];
790 
791               LLT EltTy = DstTy.getElementType();
792               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
793 
794               // Split if it's too large for the address space.
795               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
796                 unsigned NumElts = DstTy.getNumElements();
797                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
798 
799                 // FIXME: Refine when odd breakdowns handled
800                 // The scalars will need to be re-legalized.
801                 if (NumPieces == 1 || NumPieces >= NumElts ||
802                     NumElts % NumPieces != 0)
803                   return std::make_pair(0, EltTy);
804 
805                 return std::make_pair(0,
806                                       LLT::vector(NumElts / NumPieces, EltTy));
807               }
808 
809               // Need to split because of alignment.
810               unsigned Align = Query.MMODescrs[0].AlignInBits;
811               unsigned EltSize = EltTy.getSizeInBits();
812               if (EltSize > Align &&
813                   (EltSize / Align < DstTy.getNumElements())) {
814                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
815               }
816 
817               // May need relegalization for the scalars.
818               return std::make_pair(0, EltTy);
819             })
820         .minScalar(0, S32);
821 
822     if (IsStore)
823       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
824 
825     // TODO: Need a bitcast lower option?
826     Actions
827         .legalIf([=](const LegalityQuery &Query) {
828           const LLT Ty0 = Query.Types[0];
829           unsigned Size = Ty0.getSizeInBits();
830           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
831           unsigned Align = Query.MMODescrs[0].AlignInBits;
832 
833           // FIXME: Widening store from alignment not valid.
834           if (MemSize < Size)
835             MemSize = std::max(MemSize, Align);
836 
837           // No extending vector loads.
838           if (Size > MemSize && Ty0.isVector())
839             return false;
840 
841           switch (MemSize) {
842           case 8:
843           case 16:
844             return Size == 32;
845           case 32:
846           case 64:
847           case 128:
848             return true;
849           case 96:
850             return ST.hasDwordx3LoadStores();
851           case 256:
852           case 512:
853             return true;
854           default:
855             return false;
856           }
857         })
858         .widenScalarToNextPow2(0)
859         // TODO: v3s32->v4s32 with alignment
860         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
861   }
862 
863   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
864                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
865                                                   {S32, GlobalPtr, 16, 2 * 8},
866                                                   {S32, LocalPtr, 8, 8},
867                                                   {S32, LocalPtr, 16, 16},
868                                                   {S32, PrivatePtr, 8, 8},
869                                                   {S32, PrivatePtr, 16, 16},
870                                                   {S32, ConstantPtr, 8, 8},
871                                                   {S32, ConstantPtr, 16, 2 * 8}});
872   if (ST.hasFlatAddressSpace()) {
873     ExtLoads.legalForTypesWithMemDesc(
874         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
875   }
876 
877   ExtLoads.clampScalar(0, S32, S32)
878           .widenScalarToNextPow2(0)
879           .unsupportedIfMemSizeNotPow2()
880           .lower();
881 
882   auto &Atomics = getActionDefinitionsBuilder(
883     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
884      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
885      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
886      G_ATOMICRMW_UMIN})
887     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
888                {S64, GlobalPtr}, {S64, LocalPtr}});
889   if (ST.hasFlatAddressSpace()) {
890     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
891   }
892 
893   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
894     .legalFor({{S32, LocalPtr}});
895 
896   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
897   // demarshalling.
898   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
899     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
900                 {S32, FlatPtr}, {S64, FlatPtr}})
901     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
902                {S32, RegionPtr}, {S64, RegionPtr}});
903 
904   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
905     .lower();
906 
907   // TODO: Pointer types, any 32-bit or 64-bit vector
908 
909   // Condition should be s32 for scalar, s1 for vector.
910   getActionDefinitionsBuilder(G_SELECT)
911     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
912           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
913           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
914     .clampScalar(0, S16, S64)
915     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
916     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
917     .scalarize(1)
918     .clampMaxNumElements(0, S32, 2)
919     .clampMaxNumElements(0, LocalPtr, 2)
920     .clampMaxNumElements(0, PrivatePtr, 2)
921     .scalarize(0)
922     .widenScalarToNextPow2(0)
923     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
924 
925   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
926   // be more flexible with the shift amount type.
927   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
928     .legalFor({{S32, S32}, {S64, S32}});
929   if (ST.has16BitInsts()) {
930     if (ST.hasVOP3PInsts()) {
931       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
932             .clampMaxNumElements(0, S16, 2);
933     } else
934       Shifts.legalFor({{S16, S32}, {S16, S16}});
935 
936     // TODO: Support 16-bit shift amounts
937     Shifts.clampScalar(1, S32, S32);
938     Shifts.clampScalar(0, S16, S64);
939     Shifts.widenScalarToNextPow2(0, 16);
940   } else {
941     // Make sure we legalize the shift amount type first, as the general
942     // expansion for the shifted type will produce much worse code if it hasn't
943     // been truncated already.
944     Shifts.clampScalar(1, S32, S32);
945     Shifts.clampScalar(0, S32, S64);
946     Shifts.widenScalarToNextPow2(0, 32);
947   }
948   Shifts.scalarize(0);
949 
950   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
951     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
952     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
953     unsigned IdxTypeIdx = 2;
954 
955     getActionDefinitionsBuilder(Op)
956       .customIf([=](const LegalityQuery &Query) {
957           const LLT EltTy = Query.Types[EltTypeIdx];
958           const LLT VecTy = Query.Types[VecTypeIdx];
959           const LLT IdxTy = Query.Types[IdxTypeIdx];
960           return (EltTy.getSizeInBits() == 16 ||
961                   EltTy.getSizeInBits() % 32 == 0) &&
962                  VecTy.getSizeInBits() % 32 == 0 &&
963                  VecTy.getSizeInBits() <= 1024 &&
964                  IdxTy.getSizeInBits() == 32;
965         })
966       .clampScalar(EltTypeIdx, S32, S64)
967       .clampScalar(VecTypeIdx, S32, S64)
968       .clampScalar(IdxTypeIdx, S32, S32);
969   }
970 
971   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
972     .unsupportedIf([=](const LegalityQuery &Query) {
973         const LLT &EltTy = Query.Types[1].getElementType();
974         return Query.Types[0] != EltTy;
975       });
976 
977   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
978     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
979     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
980 
981     // FIXME: Doesn't handle extract of illegal sizes.
982     getActionDefinitionsBuilder(Op)
983       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
984       // FIXME: Multiples of 16 should not be legal.
985       .legalIf([=](const LegalityQuery &Query) {
986           const LLT BigTy = Query.Types[BigTyIdx];
987           const LLT LitTy = Query.Types[LitTyIdx];
988           return (BigTy.getSizeInBits() % 32 == 0) &&
989                  (LitTy.getSizeInBits() % 16 == 0);
990         })
991       .widenScalarIf(
992         [=](const LegalityQuery &Query) {
993           const LLT BigTy = Query.Types[BigTyIdx];
994           return (BigTy.getScalarSizeInBits() < 16);
995         },
996         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
997       .widenScalarIf(
998         [=](const LegalityQuery &Query) {
999           const LLT LitTy = Query.Types[LitTyIdx];
1000           return (LitTy.getScalarSizeInBits() < 16);
1001         },
1002         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1003       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1004       .widenScalarToNextPow2(BigTyIdx, 32);
1005 
1006   }
1007 
1008   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1009     .legalForCartesianProduct(AllS32Vectors, {S32})
1010     .legalForCartesianProduct(AllS64Vectors, {S64})
1011     .clampNumElements(0, V16S32, V32S32)
1012     .clampNumElements(0, V2S64, V16S64)
1013     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1014 
1015   if (ST.hasScalarPackInsts())
1016     BuildVector.legalFor({V2S16, S32});
1017 
1018   BuildVector
1019     .minScalarSameAs(1, 0)
1020     .legalIf(isRegisterType(0))
1021     .minScalarOrElt(0, S32);
1022 
1023   if (ST.hasScalarPackInsts()) {
1024     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1025       .legalFor({V2S16, S32})
1026       .lower();
1027   } else {
1028     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1029       .lower();
1030   }
1031 
1032   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1033     .legalIf(isRegisterType(0));
1034 
1035   // TODO: Don't fully scalarize v2s16 pieces
1036   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1037 
1038   // Merge/Unmerge
1039   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1040     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1041     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1042 
1043     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1044       const LLT &Ty = Query.Types[TypeIdx];
1045       if (Ty.isVector()) {
1046         const LLT &EltTy = Ty.getElementType();
1047         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1048           return true;
1049         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1050           return true;
1051       }
1052       return false;
1053     };
1054 
1055     auto &Builder = getActionDefinitionsBuilder(Op)
1056       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1057       // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
1058       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1059       // valid.
1060       .clampScalar(LitTyIdx, S16, S256)
1061       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1062       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1063       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1064                            elementTypeIs(1, S16)),
1065                        changeTo(1, V2S16))
1066       // Break up vectors with weird elements into scalars
1067       .fewerElementsIf(
1068         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1069         scalarize(0))
1070       .fewerElementsIf(
1071         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1072         scalarize(1))
1073       .clampScalar(BigTyIdx, S32, S1024)
1074       .lowerFor({{S16, V2S16}});
1075 
1076     if (Op == G_MERGE_VALUES) {
1077       Builder.widenScalarIf(
1078         // TODO: Use 16-bit shifts if legal for 8-bit values?
1079         [=](const LegalityQuery &Query) {
1080           const LLT Ty = Query.Types[LitTyIdx];
1081           return Ty.getSizeInBits() < 32;
1082         },
1083         changeTo(LitTyIdx, S32));
1084     }
1085 
1086     Builder.widenScalarIf(
1087       [=](const LegalityQuery &Query) {
1088         const LLT Ty = Query.Types[BigTyIdx];
1089         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1090           Ty.getSizeInBits() % 16 != 0;
1091       },
1092       [=](const LegalityQuery &Query) {
1093         // Pick the next power of 2, or, once that reaches 256 bits, the next
1094         // multiple of 64 if that is smaller.
1095         const LLT &Ty = Query.Types[BigTyIdx];
1096         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1097         if (NewSizeInBits >= 256) {
1098           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1099           if (RoundedTo < NewSizeInBits)
1100             NewSizeInBits = RoundedTo;
1101         }
1102         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1103       })
1104       .legalIf([=](const LegalityQuery &Query) {
1105           const LLT &BigTy = Query.Types[BigTyIdx];
1106           const LLT &LitTy = Query.Types[LitTyIdx];
1107 
1108           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1109             return false;
1110           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1111             return false;
1112 
1113           return BigTy.getSizeInBits() % 16 == 0 &&
1114                  LitTy.getSizeInBits() % 16 == 0 &&
1115                  BigTy.getSizeInBits() <= 1024;
1116         })
1117       // Any vectors left are the wrong size. Scalarize them.
1118       .scalarize(0)
1119       .scalarize(1);
1120   }
1121 
1122   // TODO: Make legal for s32, s64. s64 case needs break down in regbankselect.
1123   getActionDefinitionsBuilder(G_SEXT_INREG)
1124     .clampScalar(0, MinLegalScalarShiftTy, S64)
1125     .lower();
1126 
1127   getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1128 
1129   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1130     .legalFor({S64});
1131 
1132   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1133         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1134         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1135     .unsupported();
1136 
1137   computeTables();
1138   verify(*ST.getInstrInfo());
1139 }
1140 
1141 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1142                                          MachineRegisterInfo &MRI,
1143                                          MachineIRBuilder &B,
1144                                          GISelChangeObserver &Observer) const {
1145   switch (MI.getOpcode()) {
1146   case TargetOpcode::G_ADDRSPACE_CAST:
1147     return legalizeAddrSpaceCast(MI, MRI, B);
1148   case TargetOpcode::G_FRINT:
1149     return legalizeFrint(MI, MRI, B);
1150   case TargetOpcode::G_FCEIL:
1151     return legalizeFceil(MI, MRI, B);
1152   case TargetOpcode::G_INTRINSIC_TRUNC:
1153     return legalizeIntrinsicTrunc(MI, MRI, B);
1154   case TargetOpcode::G_SITOFP:
1155     return legalizeITOFP(MI, MRI, B, true);
1156   case TargetOpcode::G_UITOFP:
1157     return legalizeITOFP(MI, MRI, B, false);
1158   case TargetOpcode::G_FMINNUM:
1159   case TargetOpcode::G_FMAXNUM:
1160   case TargetOpcode::G_FMINNUM_IEEE:
1161   case TargetOpcode::G_FMAXNUM_IEEE:
1162     return legalizeMinNumMaxNum(MI, MRI, B);
1163   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1164     return legalizeExtractVectorElt(MI, MRI, B);
1165   case TargetOpcode::G_INSERT_VECTOR_ELT:
1166     return legalizeInsertVectorElt(MI, MRI, B);
1167   case TargetOpcode::G_FSIN:
1168   case TargetOpcode::G_FCOS:
1169     return legalizeSinCos(MI, MRI, B);
1170   case TargetOpcode::G_GLOBAL_VALUE:
1171     return legalizeGlobalValue(MI, MRI, B);
1172   case TargetOpcode::G_LOAD:
1173     return legalizeLoad(MI, MRI, B, Observer);
1174   case TargetOpcode::G_FMAD:
1175     return legalizeFMad(MI, MRI, B);
1176   case TargetOpcode::G_FDIV:
1177     return legalizeFDIV(MI, MRI, B);
1178   case TargetOpcode::G_ATOMIC_CMPXCHG:
1179     return legalizeAtomicCmpXChg(MI, MRI, B);
1180   default:
1181     return false;
1182   }
1183 
1184   llvm_unreachable("expected switch to return");
1185 }
1186 
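// Return a 32-bit register containing the aperture (the high half of the
// 64-bit flat address) for the given LDS or private address space, read either
// from the aperture registers or from the queue pointer.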
1187 Register AMDGPULegalizerInfo::getSegmentAperture(
1188   unsigned AS,
1189   MachineRegisterInfo &MRI,
1190   MachineIRBuilder &B) const {
1191   MachineFunction &MF = B.getMF();
1192   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1193   const LLT S32 = LLT::scalar(32);
1194 
1195   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1196 
1197   if (ST.hasApertureRegs()) {
1198     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1199     // getreg.
1200     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1201         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1202         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1203     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1204         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1205         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1206     unsigned Encoding =
1207         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1208         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1209         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1210 
1211     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1212     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1213 
1214     B.buildInstr(AMDGPU::S_GETREG_B32)
1215       .addDef(GetReg)
1216       .addImm(Encoding);
1217     MRI.setType(GetReg, S32);
1218 
1219     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1220     B.buildInstr(TargetOpcode::G_SHL)
1221       .addDef(ApertureReg)
1222       .addUse(GetReg)
1223       .addUse(ShiftAmt.getReg(0));
1224 
1225     return ApertureReg;
1226   }
1227 
1228   Register QueuePtr = MRI.createGenericVirtualRegister(
1229     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1230 
1231   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1232   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1233     return Register();
1234 
1235   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1236   // private_segment_aperture_base_hi.
1237   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1238 
1239   // TODO: can we be smarter about machine pointer info?
1240   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1241   MachineMemOperand *MMO = MF.getMachineMemOperand(
1242     PtrInfo,
1243     MachineMemOperand::MOLoad |
1244     MachineMemOperand::MODereferenceable |
1245     MachineMemOperand::MOInvariant,
1246     4,
1247     MinAlign(64, StructOffset));
1248 
1249   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1250   Register LoadAddr;
1251 
1252   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1253   B.buildLoad(LoadResult, LoadAddr, *MMO);
1254   return LoadResult;
1255 }
1256 
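// Lower G_ADDRSPACE_CAST: no-op casts become bitcasts, casts involving the
// 32-bit constant address space extract or merge the 32-bit half, and casts
// between flat and local/private compare against the null pointer and strip or
// append the segment aperture.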
1257 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1258   MachineInstr &MI, MachineRegisterInfo &MRI,
1259   MachineIRBuilder &B) const {
1260   MachineFunction &MF = B.getMF();
1261 
1262   B.setInstr(MI);
1263 
1264   const LLT S32 = LLT::scalar(32);
1265   Register Dst = MI.getOperand(0).getReg();
1266   Register Src = MI.getOperand(1).getReg();
1267 
1268   LLT DstTy = MRI.getType(Dst);
1269   LLT SrcTy = MRI.getType(Src);
1270   unsigned DestAS = DstTy.getAddressSpace();
1271   unsigned SrcAS = SrcTy.getAddressSpace();
1272 
1273   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1274   // vector element.
1275   assert(!DstTy.isVector());
1276 
1277   const AMDGPUTargetMachine &TM
1278     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1279 
1280   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1281   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1282     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1283     return true;
1284   }
1285 
1286   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1287     // Truncate.
1288     B.buildExtract(Dst, Src, 0);
1289     MI.eraseFromParent();
1290     return true;
1291   }
1292 
1293   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1294     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1295     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1296 
1297     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1298     // another pointer. Merge operands are required to be the same type, but
1299     // creating an extra ptrtoint would be kind of pointless.
1300     auto HighAddr = B.buildConstant(
1301       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1302     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1303     MI.eraseFromParent();
1304     return true;
1305   }
1306 
1307   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1308     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1309            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1310     unsigned NullVal = TM.getNullPointerValue(DestAS);
1311 
1312     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1313     auto FlatNull = B.buildConstant(SrcTy, 0);
1314 
1315     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1316 
1317     // Extract low 32-bits of the pointer.
1318     B.buildExtract(PtrLo32, Src, 0);
1319 
1320     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1321     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1322     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1323 
1324     MI.eraseFromParent();
1325     return true;
1326   }
1327 
1328   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1329     return false;
1330 
1331   if (!ST.hasFlatAddressSpace())
1332     return false;
1333 
1334   auto SegmentNull =
1335       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1336   auto FlatNull =
1337       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1338 
1339   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1340   if (!ApertureReg.isValid())
1341     return false;
1342 
1343   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1344   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1345 
1346   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1347 
1348   // Coerce the type of the low half of the result so we can use merge_values.
1349   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1350   B.buildInstr(TargetOpcode::G_PTRTOINT)
1351     .addDef(SrcAsInt)
1352     .addUse(Src);
1353 
1354   // TODO: Should we allow mismatched types but matching sizes in merges to
1355   // avoid the ptrtoint?
1356   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1357   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1358 
1359   MI.eraseFromParent();
1360   return true;
1361 }
1362 
1363 bool AMDGPULegalizerInfo::legalizeFrint(
1364   MachineInstr &MI, MachineRegisterInfo &MRI,
1365   MachineIRBuilder &B) const {
1366   B.setInstr(MI);
1367 
1368   Register Src = MI.getOperand(1).getReg();
1369   LLT Ty = MRI.getType(Src);
1370   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1371 
1372   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1373   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1374 
1375   auto C1 = B.buildFConstant(Ty, C1Val);
1376   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1377 
1378   // TODO: Should this propagate fast-math-flags?
1379   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1380   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1381 
1382   auto C2 = B.buildFConstant(Ty, C2Val);
1383   auto Fabs = B.buildFAbs(Ty, Src);
1384 
1385   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1386   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1387   return true;
1388 }
1389 
1390 bool AMDGPULegalizerInfo::legalizeFceil(
1391   MachineInstr &MI, MachineRegisterInfo &MRI,
1392   MachineIRBuilder &B) const {
1393   B.setInstr(MI);
1394 
1395   const LLT S1 = LLT::scalar(1);
1396   const LLT S64 = LLT::scalar(64);
1397 
1398   Register Src = MI.getOperand(1).getReg();
1399   assert(MRI.getType(Src) == S64);
1400 
1401   // result = trunc(src)
1402   // if (src > 0.0 && src != result)
1403   //   result += 1.0
1404 
1405   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1406 
1407   const auto Zero = B.buildFConstant(S64, 0.0);
1408   const auto One = B.buildFConstant(S64, 1.0);
1409   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1410   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1411   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1412   auto Add = B.buildSelect(S64, And, One, Zero);
1413 
1414   // TODO: Should this propagate fast-math-flags?
1415   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1416   return true;
1417 }
1418 
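// Extract the exponent field from the high 32 bits (Hi) of an f64 bit pattern
// and subtract the bias (1023) to produce the signed, unbiased exponent.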
1419 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1420                                               MachineIRBuilder &B) {
1421   const unsigned FractBits = 52;
1422   const unsigned ExpBits = 11;
1423   LLT S32 = LLT::scalar(32);
1424 
1425   auto Const0 = B.buildConstant(S32, FractBits - 32);
1426   auto Const1 = B.buildConstant(S32, ExpBits);
1427 
1428   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1429     .addUse(Register(Hi))
1430     .addUse(Const0.getReg(0)).addUse(Const1.getReg(0));
1431 
1432   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1433 }
1434 
1435 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1436   MachineInstr &MI, MachineRegisterInfo &MRI,
1437   MachineIRBuilder &B) const {
1438   B.setInstr(MI);
1439 
1440   const LLT S1 = LLT::scalar(1);
1441   const LLT S32 = LLT::scalar(32);
1442   const LLT S64 = LLT::scalar(64);
1443 
1444   Register Src = MI.getOperand(1).getReg();
1445   assert(MRI.getType(Src) == S64);
1446 
1447   // TODO: Should this use extract since the low half is unused?
1448   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1449   Register Hi = Unmerge.getReg(1);
1450 
1451   // Extract the upper half, since this is where we will find the sign and
1452   // exponent.
1453   auto Exp = extractF64Exponent(Hi, B);
1454 
1455   const unsigned FractBits = 52;
1456 
1457   // Extract the sign bit.
1458   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1459   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1460 
1461   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1462 
1463   const auto Zero32 = B.buildConstant(S32, 0);
1464 
1465   // Extend back to 64-bits.
1466   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1467 
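  // Shifting the fraction mask right by the exponent leaves set bits only in
  // the fractional bit positions, so clearing those bits in Src truncates
  // toward zero. Exp < 0 means |Src| < 1 and only the sign survives;
  // Exp > 51 means Src is already integral.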
1468   auto Shr = B.buildAShr(S64, FractMask, Exp);
1469   auto Not = B.buildNot(S64, Shr);
1470   auto Tmp0 = B.buildAnd(S64, Src, Not);
1471   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1472 
1473   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1474   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1475 
1476   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1479 }
1480 
1481 bool AMDGPULegalizerInfo::legalizeITOFP(
1482   MachineInstr &MI, MachineRegisterInfo &MRI,
1483   MachineIRBuilder &B, bool Signed) const {
1484   B.setInstr(MI);
1485 
1486   Register Dst = MI.getOperand(0).getReg();
1487   Register Src = MI.getOperand(1).getReg();
1488 
1489   const LLT S64 = LLT::scalar(64);
1490   const LLT S32 = LLT::scalar(32);
1491 
1492   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1493 
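  // Convert the two 32-bit halves separately: the high half (signed or
  // unsigned as requested) is scaled back up by 2^32 with ldexp, then the
  // unsigned low half is added in.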
1494   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1495 
1496   auto CvtHi = Signed ?
1497     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1498     B.buildUITOFP(S64, Unmerge.getReg(1));
1499 
1500   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1501 
1502   auto ThirtyTwo = B.buildConstant(S32, 32);
1503   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1504     .addUse(CvtHi.getReg(0))
1505     .addUse(ThirtyTwo.getReg(0));
1506 
1507   // TODO: Should this propagate fast-math-flags?
1508   B.buildFAdd(Dst, LdExp, CvtLo);
1509   MI.eraseFromParent();
1510   return true;
1511 }
1512 
1513 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1514   MachineInstr &MI, MachineRegisterInfo &MRI,
1515   MachineIRBuilder &B) const {
1516   MachineFunction &MF = B.getMF();
1517   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1518 
1519   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1520                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1521 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1524   if (!MFI->getMode().IEEE)
1525     return !IsIEEEOp;
1526 
1527   if (IsIEEEOp)
1528     return true;
1529 
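  // In IEEE mode the non-IEEE opcodes must quiet signaling NaN inputs, so
  // defer to the generic lowering in terms of the IEEE variants.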
1530   MachineIRBuilder HelperBuilder(MI);
1531   GISelObserverWrapper DummyObserver;
1532   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1533   HelperBuilder.setInstr(MI);
1534   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1535 }
1536 
1537 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1538   MachineInstr &MI, MachineRegisterInfo &MRI,
1539   MachineIRBuilder &B) const {
1540   // TODO: Should move some of this into LegalizerHelper.
1541 
1542   // TODO: Promote dynamic indexing of s16 to s32
1543   // TODO: Dynamic s64 indexing is only legal for SGPR.
1544   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1545   if (!IdxVal) // Dynamic case will be selected to register indexing.
1546     return true;
1547 
1548   Register Dst = MI.getOperand(0).getReg();
1549   Register Vec = MI.getOperand(1).getReg();
1550 
1551   LLT VecTy = MRI.getType(Vec);
1552   LLT EltTy = VecTy.getElementType();
1553   assert(EltTy == MRI.getType(Dst));
1554 
1555   B.setInstr(MI);
1556 
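  // A constant index that is out of bounds folds to an undefined result
  // rather than an extract.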
1557   if (IdxVal.getValue() < VecTy.getNumElements())
1558     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1559   else
1560     B.buildUndef(Dst);
1561 
1562   MI.eraseFromParent();
1563   return true;
1564 }
1565 
1566 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1567   MachineInstr &MI, MachineRegisterInfo &MRI,
1568   MachineIRBuilder &B) const {
1569   // TODO: Should move some of this into LegalizerHelper.
1570 
1571   // TODO: Promote dynamic indexing of s16 to s32
1572   // TODO: Dynamic s64 indexing is only legal for SGPR.
1573   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1574   if (!IdxVal) // Dynamic case will be selected to register indexing.
1575     return true;
1576 
1577   Register Dst = MI.getOperand(0).getReg();
1578   Register Vec = MI.getOperand(1).getReg();
1579   Register Ins = MI.getOperand(2).getReg();
1580 
1581   LLT VecTy = MRI.getType(Vec);
1582   LLT EltTy = VecTy.getElementType();
1583   assert(EltTy == MRI.getType(Ins));
1584 
1585   B.setInstr(MI);
1586 
1587   if (IdxVal.getValue() < VecTy.getNumElements())
1588     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1589   else
1590     B.buildUndef(Dst);
1591 
1592   MI.eraseFromParent();
1593   return true;
1594 }
1595 
1596 bool AMDGPULegalizerInfo::legalizeSinCos(
1597   MachineInstr &MI, MachineRegisterInfo &MRI,
1598   MachineIRBuilder &B) const {
1599   B.setInstr(MI);
1600 
1601   Register DstReg = MI.getOperand(0).getReg();
1602   Register SrcReg = MI.getOperand(1).getReg();
1603   LLT Ty = MRI.getType(DstReg);
1604   unsigned Flags = MI.getFlags();
1605 
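  // The hardware sin/cos instructions take the angle pre-multiplied by
  // 1/(2*pi); subtargets with a reduced trig range also need the operand
  // wrapped into [0, 1) with fract first.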
1606   Register TrigVal;
1607   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1608   if (ST.hasTrigReducedRange()) {
1609     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1610     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1611       .addUse(MulVal.getReg(0))
1612       .setMIFlags(Flags).getReg(0);
1613   } else
1614     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1615 
1616   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1617     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1618   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1619     .addUse(TrigVal)
1620     .setMIFlags(Flags);
1621   MI.eraseFromParent();
1622   return true;
1623 }
1624 
1625 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1626   Register DstReg, LLT PtrTy,
1627   MachineIRBuilder &B, const GlobalValue *GV,
1628   unsigned Offset, unsigned GAFlags) const {
1629   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1630   // to the following code sequence:
1631   //
1632   // For constant address space:
1633   //   s_getpc_b64 s[0:1]
1634   //   s_add_u32 s0, s0, $symbol
1635   //   s_addc_u32 s1, s1, 0
1636   //
1637   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1638   //   a fixup or relocation is emitted to replace $symbol with a literal
1639   //   constant, which is a pc-relative offset from the encoding of the $symbol
1640   //   operand to the global variable.
1641   //
1642   // For global address space:
1643   //   s_getpc_b64 s[0:1]
1644   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1645   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1646   //
1647   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1648   //   fixups or relocations are emitted to replace $symbol@*@lo and
1649   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1650   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1651   //   operand to the global variable.
1652   //
1653   // What we want here is an offset from the value returned by s_getpc
1654   // (which is the address of the s_add_u32 instruction) to the global
1655   // variable, but since the encoding of $symbol starts 4 bytes after the start
1656   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1657   // small. This requires us to add 4 to the global variable offset in order to
1658   // compute the correct address.
1659 
1660   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1661 
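  // SI_PC_ADD_REL_OFFSET always produces a 64-bit address; for a 32-bit
  // destination pointer, compute into a temporary 64-bit constant pointer
  // and extract the low half at the end.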
1662   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1663     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1664 
1665   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1666     .addDef(PCReg);
1667 
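  // The @lo/@hi relocation flags are consecutive values, so GAFlags + 1
  // selects the high-half counterpart of GAFlags.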
1668   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1669   if (GAFlags == SIInstrInfo::MO_NONE)
1670     MIB.addImm(0);
1671   else
1672     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1673 
1674   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1675 
1676   if (PtrTy.getSizeInBits() == 32)
1677     B.buildExtract(DstReg, PCReg, 0);
1678   return true;
1679  }
1680 
1681 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1682   MachineInstr &MI, MachineRegisterInfo &MRI,
1683   MachineIRBuilder &B) const {
1684   Register DstReg = MI.getOperand(0).getReg();
1685   LLT Ty = MRI.getType(DstReg);
1686   unsigned AS = Ty.getAddressSpace();
1687 
1688   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1689   MachineFunction &MF = B.getMF();
1690   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1691   B.setInstr(MI);
1692 
1693   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1694     if (!MFI->isEntryFunction()) {
1695       const Function &Fn = MF.getFunction();
1696       DiagnosticInfoUnsupported BadLDSDecl(
1697         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1698       Fn.getContext().diagnose(BadLDSDecl);
1699     }
1700 
1701     // TODO: We could emit code to handle the initialization somewhere.
1702     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1703       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1704       MI.eraseFromParent();
1705       return true;
1706     }
1707 
1708     const Function &Fn = MF.getFunction();
1709     DiagnosticInfoUnsupported BadInit(
1710       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1711     Fn.getContext().diagnose(BadInit);
1712     return true;
1713   }
1714 
1715   const SITargetLowering *TLI = ST.getTargetLowering();
1716 
1717   if (TLI->shouldEmitFixup(GV)) {
1718     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1719     MI.eraseFromParent();
1720     return true;
1721   }
1722 
1723   if (TLI->shouldEmitPCReloc(GV)) {
1724     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1725     MI.eraseFromParent();
1726     return true;
1727   }
1728 
1729   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1730   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1731 
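  // Globals that need a GOT entry are reached in two steps: compute the
  // address of the GOT slot pc-relative, then load the 64-bit address of the
  // global from that slot.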
1732   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1733     MachinePointerInfo::getGOT(MF),
1734     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1735     MachineMemOperand::MOInvariant,
1736     8 /*Size*/, 8 /*Align*/);
1737 
1738   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1739 
1740   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1742     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1743     B.buildExtract(DstReg, Load, 0);
1744   } else
1745     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1746 
1747   MI.eraseFromParent();
1748   return true;
1749 }
1750 
1751 bool AMDGPULegalizerInfo::legalizeLoad(
1752   MachineInstr &MI, MachineRegisterInfo &MRI,
1753   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1754   B.setInstr(MI);
1755   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1756   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1757   Observer.changingInstr(MI);
1758   MI.getOperand(1).setReg(Cast.getReg(0));
1759   Observer.changedInstr(MI);
1760   return true;
1761 }
1762 
1763 bool AMDGPULegalizerInfo::legalizeFMad(
1764   MachineInstr &MI, MachineRegisterInfo &MRI,
1765   MachineIRBuilder &B) const {
1766   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1767   assert(Ty.isScalar());
1768 
1769   MachineFunction &MF = B.getMF();
1770   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1771 
1772   // TODO: Always legal with future ftz flag.
1773   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1774     return true;
1775   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
1780   GISelObserverWrapper DummyObserver;
1781   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1782   HelperBuilder.setMBB(*MI.getParent());
1783   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1784 }
1785 
1786 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1787   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1788   Register DstReg = MI.getOperand(0).getReg();
1789   Register PtrReg = MI.getOperand(1).getReg();
1790   Register CmpVal = MI.getOperand(2).getReg();
1791   Register NewVal = MI.getOperand(3).getReg();
1792 
1793   assert(SITargetLowering::isFlatGlobalAddrSpace(
1794            MRI.getType(PtrReg).getAddressSpace()) &&
1795          "this should not have been custom lowered");
1796 
1797   LLT ValTy = MRI.getType(CmpVal);
1798   LLT VecTy = LLT::vector(2, ValTy);
1799 
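  // The target atomic cmpxchg pseudo takes the new value and the compare
  // value packed together as a two-element vector operand.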
1800   B.setInstr(MI);
1801   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1802 
1803   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1804     .addDef(DstReg)
1805     .addUse(PtrReg)
1806     .addUse(PackedVal)
1807     .setMemRefs(MI.memoperands());
1808 
1809   MI.eraseFromParent();
1810   return true;
1811 }
1812 
// Return the G_BRCOND that consumes the intrinsic's condition output, or null
// if the usage is invalid.
1814 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1815                                        MachineRegisterInfo &MRI,
1816                                        MachineInstr *&Br) {
1817   Register CondDef = MI.getOperand(0).getReg();
1818   if (!MRI.hasOneNonDBGUse(CondDef))
1819     return nullptr;
1820 
1821   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1822   if (UseMI.getParent() != MI.getParent() ||
1823       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1824     return nullptr;
1825 
1826   // Make sure the cond br is followed by a G_BR
1827   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1828   if (Next != MI.getParent()->end()) {
1829     if (Next->getOpcode() != AMDGPU::G_BR)
1830       return nullptr;
1831     Br = &*Next;
1832   }
1833 
1834   return &UseMI;
1835 }
1836 
1837 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1838                                                 Register Reg, LLT Ty) const {
1839   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1840   if (LiveIn)
1841     return LiveIn;
1842 
1843   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1844   MRI.addLiveIn(Reg, NewReg);
1845   return NewReg;
1846 }
1847 
1848 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1849                                          const ArgDescriptor *Arg) const {
1850   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1851     return false; // TODO: Handle these
1852 
1853   assert(Arg->getRegister().isPhysical());
1854 
1855   MachineRegisterInfo &MRI = *B.getMRI();
1856 
1857   LLT Ty = MRI.getType(DstReg);
1858   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1859 
1860   if (Arg->isMasked()) {
1861     // TODO: Should we try to emit this once in the entry block?
1862     const LLT S32 = LLT::scalar(32);
1863     const unsigned Mask = Arg->getMask();
1864     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1865 
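    // The live-in register packs several values together; shift this
    // argument's bit field down to bit 0 and mask away the rest.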
1866     Register AndMaskSrc = LiveIn;
1867 
1868     if (Shift != 0) {
1869       auto ShiftAmt = B.buildConstant(S32, Shift);
1870       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1871     }
1872 
1873     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1874   } else
1875     B.buildCopy(DstReg, LiveIn);
1876 
  // Insert the argument copy if it doesn't already exist.
1878   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1879   if (!MRI.getVRegDef(LiveIn)) {
1880     // FIXME: Should have scoped insert pt
1881     MachineBasicBlock &OrigInsBB = B.getMBB();
1882     auto OrigInsPt = B.getInsertPt();
1883 
1884     MachineBasicBlock &EntryMBB = B.getMF().front();
1885     EntryMBB.addLiveIn(Arg->getRegister());
1886     B.setInsertPt(EntryMBB, EntryMBB.begin());
1887     B.buildCopy(LiveIn, Arg->getRegister());
1888 
1889     B.setInsertPt(OrigInsBB, OrigInsPt);
1890   }
1891 
1892   return true;
1893 }
1894 
1895 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1896   MachineInstr &MI,
1897   MachineRegisterInfo &MRI,
1898   MachineIRBuilder &B,
1899   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1900   B.setInstr(MI);
1901 
1902   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1903 
1904   const ArgDescriptor *Arg;
1905   const TargetRegisterClass *RC;
1906   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1907   if (!Arg) {
1908     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1909     return false;
1910   }
1911 
1912   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1913     MI.eraseFromParent();
1914     return true;
1915   }
1916 
1917   return false;
1918 }
1919 
1920 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1921                                        MachineRegisterInfo &MRI,
1922                                        MachineIRBuilder &B) const {
1923   B.setInstr(MI);
1924   Register Dst = MI.getOperand(0).getReg();
1925   LLT DstTy = MRI.getType(Dst);
1926   LLT S16 = LLT::scalar(16);
1927   LLT S32 = LLT::scalar(32);
1928   LLT S64 = LLT::scalar(64);
1929 
1930   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1931     return true;
1932 
1933   if (DstTy == S16)
1934     return legalizeFDIV16(MI, MRI, B);
1935   if (DstTy == S32)
1936     return legalizeFDIV32(MI, MRI, B);
1937   if (DstTy == S64)
1938     return legalizeFDIV64(MI, MRI, B);
1939 
1940   return false;
1941 }
1942 
1943 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1944                                                  MachineRegisterInfo &MRI,
1945                                                  MachineIRBuilder &B) const {
1946   Register Res = MI.getOperand(0).getReg();
1947   Register LHS = MI.getOperand(1).getReg();
1948   Register RHS = MI.getOperand(2).getReg();
1949 
1950   uint16_t Flags = MI.getFlags();
1951 
1952   LLT ResTy = MRI.getType(Res);
1953   LLT S32 = LLT::scalar(32);
1954   LLT S64 = LLT::scalar(64);
1955 
1956   const MachineFunction &MF = B.getMF();
1957   bool Unsafe =
1958     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1959 
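  // Without unsafe math, f64 and denormal-enabled f32 division need the full
  // div_scale/div_fmas/div_fixup expansion for correct results, so bail out
  // and let legalizeFDIV32/legalizeFDIV64 handle them.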
1960   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1961     return false;
1962 
1963   if (!Unsafe && ResTy == S32 &&
1964       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1965     return false;
1966 
1967   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1968     // 1 / x -> RCP(x)
1969     if (CLHS->isExactlyValue(1.0)) {
1970       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1971         .addUse(RHS)
1972         .setMIFlags(Flags);
1973 
1974       MI.eraseFromParent();
1975       return true;
1976     }
1977 
1978     // -1 / x -> RCP( FNEG(x) )
1979     if (CLHS->isExactlyValue(-1.0)) {
1980       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1981       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1982         .addUse(FNeg.getReg(0))
1983         .setMIFlags(Flags);
1984 
1985       MI.eraseFromParent();
1986       return true;
1987     }
1988   }
1989 
1990   // x / y -> x * (1.0 / y)
1991   if (Unsafe) {
1992     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1993       .addUse(RHS)
1994       .setMIFlags(Flags);
1995     B.buildFMul(Res, LHS, RCP, Flags);
1996 
1997     MI.eraseFromParent();
1998     return true;
1999   }
2000 
2001   return false;
2002 }
2003 
2004 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2005                                          MachineRegisterInfo &MRI,
2006                                          MachineIRBuilder &B) const {
2007   B.setInstr(MI);
2008   Register Res = MI.getOperand(0).getReg();
2009   Register LHS = MI.getOperand(1).getReg();
2010   Register RHS = MI.getOperand(2).getReg();
2011 
2012   uint16_t Flags = MI.getFlags();
2013 
2014   LLT S16 = LLT::scalar(16);
2015   LLT S32 = LLT::scalar(32);
2016 
2017   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2018   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2019 
2020   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2021     .addUse(RHSExt.getReg(0))
2022     .setMIFlags(Flags);
2023 
2024   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2025   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2026 
2027   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2028     .addUse(RDst.getReg(0))
2029     .addUse(RHS)
2030     .addUse(LHS)
2031     .setMIFlags(Flags);
2032 
2033   MI.eraseFromParent();
2034   return true;
2035 }
2036 
2037 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2038 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2039 static void toggleSPDenormMode(bool Enable,
2040                                MachineIRBuilder &B,
2041                                const GCNSubtarget &ST,
2042                                AMDGPU::SIModeRegisterDefaults Mode) {
2043   // Set SP denorm mode to this value.
2044   unsigned SPDenormMode =
2045     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2046 
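  // Targets with s_denorm_mode can write the FP32 and FP64/FP16 denorm fields
  // in one instruction; older targets instead use s_setreg to update just the
  // two FP32 denorm bits (offset 4, width 2) of the MODE register.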
2047   if (ST.hasDenormModeInst()) {
2048     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2049     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2050                                    ? FP_DENORM_FLUSH_NONE
2051                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2052 
2053     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2054     B.buildInstr(AMDGPU::S_DENORM_MODE)
2055       .addImm(NewDenormModeValue);
2056 
2057   } else {
2058     // Select FP32 bit field in mode register.
2059     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2060                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2061                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2062 
2063     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2064       .addImm(SPDenormMode)
2065       .addImm(SPDenormModeBitField);
2066   }
2067 }
2068 
2069 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2070                                          MachineRegisterInfo &MRI,
2071                                          MachineIRBuilder &B) const {
2072   B.setInstr(MI);
2073   Register Res = MI.getOperand(0).getReg();
2074   Register LHS = MI.getOperand(1).getReg();
2075   Register RHS = MI.getOperand(2).getReg();
2076   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2077   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2078 
2079   uint16_t Flags = MI.getFlags();
2080 
2081   LLT S32 = LLT::scalar(32);
2082   LLT S1 = LLT::scalar(1);
2083 
2084   auto One = B.buildFConstant(S32, 1.0f);
2085 
2086   auto DenominatorScaled =
2087     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2088       .addUse(RHS)
2089       .addUse(LHS)
2090       .addImm(1)
2091       .setMIFlags(Flags);
2092   auto NumeratorScaled =
2093     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2094       .addUse(LHS)
2095       .addUse(RHS)
2096       .addImm(0)
2097       .setMIFlags(Flags);
2098 
2099   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2100     .addUse(DenominatorScaled.getReg(0))
2101     .setMIFlags(Flags);
2102   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2103 
2104   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2105   // aren't modeled as reading it.
2106   if (!Mode.FP32Denormals)
2107     toggleSPDenormMode(true, B, ST, Mode);
2108 
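  // Refine the approximate reciprocal with FMA-based Newton-Raphson steps,
  // form the scaled quotient and its remaining error term, and let div_fmas
  // and div_fixup below fold the error back in and handle the special cases.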
2109   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2110   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2111   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2112   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2113   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2114   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2115 
2116   if (!Mode.FP32Denormals)
2117     toggleSPDenormMode(false, B, ST, Mode);
2118 
2119   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2120     .addUse(Fma4.getReg(0))
2121     .addUse(Fma1.getReg(0))
2122     .addUse(Fma3.getReg(0))
2123     .addUse(NumeratorScaled.getReg(1))
2124     .setMIFlags(Flags);
2125 
2126   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2127     .addUse(Fmas.getReg(0))
2128     .addUse(RHS)
2129     .addUse(LHS)
2130     .setMIFlags(Flags);
2131 
2132   MI.eraseFromParent();
2133   return true;
2134 }
2135 
2136 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2137                                          MachineRegisterInfo &MRI,
2138                                          MachineIRBuilder &B) const {
2139   B.setInstr(MI);
2140   Register Res = MI.getOperand(0).getReg();
2141   Register LHS = MI.getOperand(1).getReg();
2142   Register RHS = MI.getOperand(2).getReg();
2143 
2144   uint16_t Flags = MI.getFlags();
2145 
2146   LLT S64 = LLT::scalar(64);
2147   LLT S1 = LLT::scalar(1);
2148 
2149   auto One = B.buildFConstant(S64, 1.0);
2150 
2151   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2152     .addUse(LHS)
2153     .addUse(RHS)
2154     .addImm(1)
2155     .setMIFlags(Flags);
2156 
2157   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2158 
2159   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2160     .addUse(DivScale0.getReg(0))
2161     .setMIFlags(Flags);
2162 
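  // Same scheme as the f32 path: refine the reciprocal with FMA steps, build
  // the scaled quotient and its error term, then div_fmas/div_fixup produce
  // the final result.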
2163   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2164   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2165   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2166 
2167   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2168     .addUse(LHS)
2169     .addUse(RHS)
2170     .addImm(0)
2171     .setMIFlags(Flags);
2172 
2173   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2175   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2176 
2177   Register Scale;
2178   if (!ST.hasUsableDivScaleConditionOutput()) {
2179     // Workaround a hardware bug on SI where the condition output from div_scale
2180     // is not usable.
2181 
2182     Scale = MRI.createGenericVirtualRegister(S1);
2183 
2184     LLT S32 = LLT::scalar(32);
2185 
2186     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2187     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2188     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2189     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2190 
2191     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2192                               Scale1Unmerge.getReg(1));
2193     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2194                               Scale0Unmerge.getReg(1));
2195     B.buildXor(Scale, CmpNum, CmpDen);
2196   } else {
2197     Scale = DivScale1.getReg(1);
2198   }
2199 
2200   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2201     .addUse(Fma4.getReg(0))
2202     .addUse(Fma3.getReg(0))
2203     .addUse(Mul.getReg(0))
2204     .addUse(Scale)
2205     .setMIFlags(Flags);
2206 
2207   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2208     .addUse(Fmas.getReg(0))
2209     .addUse(RHS)
2210     .addUse(LHS)
2211     .setMIFlags(Flags);
2212 
2213   MI.eraseFromParent();
2214   return true;
2215 }
2216 
2217 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2218                                                  MachineRegisterInfo &MRI,
2219                                                  MachineIRBuilder &B) const {
2220   B.setInstr(MI);
2221   Register Res = MI.getOperand(0).getReg();
2222   Register LHS = MI.getOperand(2).getReg();
2223   Register RHS = MI.getOperand(3).getReg();
2224   uint16_t Flags = MI.getFlags();
2225 
2226   LLT S32 = LLT::scalar(32);
2227   LLT S1 = LLT::scalar(1);
2228 
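  // If |RHS| exceeds 2^96 (0x6f800000), prescale the denominator by 2^-32
  // (0x2f800000) to keep its reciprocal out of the denormal range, and apply
  // the same scale to the final product so the quotient is unchanged.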
2229   auto Abs = B.buildFAbs(S32, RHS, Flags);
2230   const APFloat C0Val(1.0f);
2231 
2232   auto C0 = B.buildConstant(S32, 0x6f800000);
2233   auto C1 = B.buildConstant(S32, 0x2f800000);
2234   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2235 
2236   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2237   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2238 
2239   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2240 
2241   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2242     .addUse(Mul0.getReg(0))
2243     .setMIFlags(Flags);
2244 
2245   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2246 
2247   B.buildFMul(Res, Sel, Mul1, Flags);
2248 
2249   MI.eraseFromParent();
2250   return true;
2251 }
2252 
2253 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2254                                                  MachineRegisterInfo &MRI,
2255                                                  MachineIRBuilder &B) const {
2256   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2257   if (!MFI->isEntryFunction()) {
2258     return legalizePreloadedArgIntrin(MI, MRI, B,
2259                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2260   }
2261 
2262   B.setInstr(MI);
2263 
2264   uint64_t Offset =
2265     ST.getTargetLowering()->getImplicitParameterOffset(
2266       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2267   Register DstReg = MI.getOperand(0).getReg();
2268   LLT DstTy = MRI.getType(DstReg);
2269   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2270 
2271   const ArgDescriptor *Arg;
2272   const TargetRegisterClass *RC;
2273   std::tie(Arg, RC)
2274     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2275   if (!Arg)
2276     return false;
2277 
2278   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2279   if (!loadInputValue(KernargPtrReg, B, Arg))
2280     return false;
2281 
2282   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2283   MI.eraseFromParent();
2284   return true;
2285 }
2286 
2287 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2288                                               MachineRegisterInfo &MRI,
2289                                               MachineIRBuilder &B,
2290                                               unsigned AddrSpace) const {
2291   B.setInstr(MI);
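  // A flat pointer points into the queried segment iff its high 32 bits equal
  // that segment's aperture base.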
2292   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2293   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2294   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2295   MI.eraseFromParent();
2296   return true;
2297 }
2298 
2299 /// Handle register layout difference for f16 images for some subtargets.
2300 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2301                                              MachineRegisterInfo &MRI,
2302                                              Register Reg) const {
2303   if (!ST.hasUnpackedD16VMem())
2304     return Reg;
2305 
2306   const LLT S16 = LLT::scalar(16);
2307   const LLT S32 = LLT::scalar(32);
2308   LLT StoreVT = MRI.getType(Reg);
2309   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2310 
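  // On unpacked-D16 targets each 16-bit element occupies its own 32-bit
  // register, so split the vector and any-extend every element to 32 bits.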
2311   auto Unmerge = B.buildUnmerge(S16, Reg);
2312 
2313   SmallVector<Register, 4> WideRegs;
2314   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2315     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2316 
2317   int NumElts = StoreVT.getNumElements();
2318 
2319   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2320 }
2321 
2322 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2323                                                  MachineRegisterInfo &MRI,
2324                                                  MachineIRBuilder &B,
2325                                                  bool IsFormat) const {
2326   // TODO: Reject f16 format on targets where unsupported.
2327   Register VData = MI.getOperand(1).getReg();
2328   LLT Ty = MRI.getType(VData);
2329 
2330   B.setInstr(MI);
2331 
2332   const LLT S32 = LLT::scalar(32);
2333   const LLT S16 = LLT::scalar(16);
2334 
  // Fixup illegal register types for i8 and i16 stores.
2336   if (Ty == LLT::scalar(8) || Ty == S16) {
2337     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2338     MI.getOperand(1).setReg(AnyExt);
2339     return true;
2340   }
2341 
2342   if (Ty.isVector()) {
2343     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2344       if (IsFormat)
2345         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2346       return true;
2347     }
2348 
2349     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2350   }
2351 
2352   return Ty == S32;
2353 }
2354 
2355 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2356                                             MachineRegisterInfo &MRI,
2357                                             MachineIRBuilder &B) const {
  // For the control flow intrinsics, replace the G_BRCOND user with the
  // exec-manipulating branch pseudos.
2359   auto IntrID = MI.getIntrinsicID();
2360   switch (IntrID) {
2361   case Intrinsic::amdgcn_if:
2362   case Intrinsic::amdgcn_else: {
2363     MachineInstr *Br = nullptr;
2364     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2365       const SIRegisterInfo *TRI
2366         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2367 
2368       B.setInstr(*BrCond);
2369       Register Def = MI.getOperand(1).getReg();
2370       Register Use = MI.getOperand(3).getReg();
2371 
2372       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2373       if (Br)
2374         BrTarget = Br->getOperand(0).getMBB();
2375 
2376       if (IntrID == Intrinsic::amdgcn_if) {
2377         B.buildInstr(AMDGPU::SI_IF)
2378           .addDef(Def)
2379           .addUse(Use)
2380           .addMBB(BrTarget);
2381       } else {
2382         B.buildInstr(AMDGPU::SI_ELSE)
2383           .addDef(Def)
2384           .addUse(Use)
2385           .addMBB(BrTarget)
2386           .addImm(0);
2387       }
2388 
2389       if (Br)
2390         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2391 
2392       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2393       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2394       MI.eraseFromParent();
2395       BrCond->eraseFromParent();
2396       return true;
2397     }
2398 
2399     return false;
2400   }
2401   case Intrinsic::amdgcn_loop: {
2402     MachineInstr *Br = nullptr;
2403     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2404       const SIRegisterInfo *TRI
2405         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2406 
2407       B.setInstr(*BrCond);
2408 
2409       // FIXME: Need to adjust branch targets based on unconditional branch.
2410       Register Reg = MI.getOperand(2).getReg();
2411       B.buildInstr(AMDGPU::SI_LOOP)
2412         .addUse(Reg)
2413         .addMBB(BrCond->getOperand(1).getMBB());
2414       MI.eraseFromParent();
2415       BrCond->eraseFromParent();
2416       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2417       return true;
2418     }
2419 
2420     return false;
2421   }
2422   case Intrinsic::amdgcn_kernarg_segment_ptr:
2423     return legalizePreloadedArgIntrin(
2424       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2425   case Intrinsic::amdgcn_implicitarg_ptr:
2426     return legalizeImplicitArgPtr(MI, MRI, B);
2427   case Intrinsic::amdgcn_workitem_id_x:
2428     return legalizePreloadedArgIntrin(MI, MRI, B,
2429                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2430   case Intrinsic::amdgcn_workitem_id_y:
2431     return legalizePreloadedArgIntrin(MI, MRI, B,
2432                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2433   case Intrinsic::amdgcn_workitem_id_z:
2434     return legalizePreloadedArgIntrin(MI, MRI, B,
2435                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2436   case Intrinsic::amdgcn_workgroup_id_x:
2437     return legalizePreloadedArgIntrin(MI, MRI, B,
2438                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2439   case Intrinsic::amdgcn_workgroup_id_y:
2440     return legalizePreloadedArgIntrin(MI, MRI, B,
2441                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2442   case Intrinsic::amdgcn_workgroup_id_z:
2443     return legalizePreloadedArgIntrin(MI, MRI, B,
2444                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2445   case Intrinsic::amdgcn_dispatch_ptr:
2446     return legalizePreloadedArgIntrin(MI, MRI, B,
2447                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2448   case Intrinsic::amdgcn_queue_ptr:
2449     return legalizePreloadedArgIntrin(MI, MRI, B,
2450                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2451   case Intrinsic::amdgcn_implicit_buffer_ptr:
2452     return legalizePreloadedArgIntrin(
2453       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2454   case Intrinsic::amdgcn_dispatch_id:
2455     return legalizePreloadedArgIntrin(MI, MRI, B,
2456                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2457   case Intrinsic::amdgcn_fdiv_fast:
2458     return legalizeFDIVFastIntrin(MI, MRI, B);
2459   case Intrinsic::amdgcn_is_shared:
2460     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2461   case Intrinsic::amdgcn_is_private:
2462     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2463   case Intrinsic::amdgcn_wavefrontsize: {
2464     B.setInstr(MI);
2465     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2466     MI.eraseFromParent();
2467     return true;
2468   }
2469   case Intrinsic::amdgcn_raw_buffer_store:
2470     return legalizeRawBufferStore(MI, MRI, B, false);
2471   case Intrinsic::amdgcn_raw_buffer_store_format:
2472     return legalizeRawBufferStore(MI, MRI, B, true);
2473   default:
2474     return true;
2475   }
2476 
2477   return true;
2478 }
2479