1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
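// Odd-element-count vectors of sub-32-bit elements (e.g. v3s16) that do not
// fill a whole number of 32-bit registers; such vectors are padded with one
// extra element (see oneMoreElement).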
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
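// Reduce the element count so a vector wider than 64 bits is broken into
// roughly 64-bit pieces, rounding odd element counts up.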
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
// Scalars that are a multiple of 32 bits up to 1024 bits, vectors of 32, 64,
// 128 or 256-bit elements, and even-length vectors of 16-bit elements
// (multiples of v2s16).
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal); // VCC branches
248   setAction({G_BRCOND, S32}, Legal); // SCC branches
249 
250   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
251   // elements for v3s16
252   getActionDefinitionsBuilder(G_PHI)
253     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
254     .legalFor(AllS32Vectors)
255     .legalFor(AllS64Vectors)
256     .legalFor(AddrSpaces64)
257     .legalFor(AddrSpaces32)
258     .clampScalar(0, S32, S256)
259     .widenScalarToNextPow2(0, 32)
260     .clampMaxNumElements(0, S32, 16)
261     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
262     .legalIf(isPointer(0));
263 
264   if (ST.has16BitInsts()) {
265     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
266       .legalFor({S32, S16})
267       .clampScalar(0, S16, S32)
268       .scalarize(0);
269   } else {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32})
272       .clampScalar(0, S32, S32)
273       .scalarize(0);
274   }
275 
276   // FIXME: Not really legal. Placeholder for custom lowering.
277   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278     .legalFor({S32, S64})
279     .clampScalar(0, S32, S64)
280     .widenScalarToNextPow2(0, 32)
281     .scalarize(0);
282 
283   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
284     .legalFor({S32})
285     .clampScalar(0, S32, S32)
286     .scalarize(0);
287 
288   // Report legal for any types we can handle anywhere. For the cases only legal
289   // on the SALU, RegBankSelect will be able to re-legalize.
290   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
291     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
292     .clampScalar(0, S32, S64)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
295     .widenScalarToNextPow2(0)
296     .scalarize(0);
297 
298   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
299                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300     .legalFor({{S32, S1}, {S32, S32}})
301     .clampScalar(0, S32, S32)
302     .scalarize(0); // TODO: Implement.
303 
304   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
305     .lower();
306 
307   getActionDefinitionsBuilder(G_BITCAST)
308     // Don't worry about the size constraint.
309     .legalIf(all(isRegisterType(0), isRegisterType(1)))
310     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});
312 
313   getActionDefinitionsBuilder(G_FCONSTANT)
314     .legalFor({S32, S64, S16})
315     .clampScalar(0, S16, S64);
316 
317   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
318     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
319                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
320     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
321     .clampScalarOrElt(0, S32, S1024)
322     .legalIf(isMultiple32(0))
323     .widenScalarToNextPow2(0, 32)
324     .clampMaxNumElements(0, S32, 16);
325 
326 
327   // FIXME: i1 operands to intrinsics should always be legal, but other i1
328   // values may not be legal.  We need to figure out how to distinguish
329   // between these two scenarios.
330   getActionDefinitionsBuilder(G_CONSTANT)
331     .legalFor({S1, S32, S64, S16, GlobalPtr,
332                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333     .clampScalar(0, S32, S64)
334     .widenScalarToNextPow2(0)
335     .legalIf(isPointer(0));
336 
337   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340 
341 
342   auto &FPOpActions = getActionDefinitionsBuilder(
343     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344     .legalFor({S32, S64});
345   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346     .customFor({S32, S64});
347   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348     .customFor({S32, S64});
349 
350   if (ST.has16BitInsts()) {
351     if (ST.hasVOP3PInsts())
352       FPOpActions.legalFor({S16, V2S16});
353     else
354       FPOpActions.legalFor({S16});
355 
356     TrigActions.customFor({S16});
357     FDIVActions.customFor({S16});
358   }
359 
360   auto &MinNumMaxNum = getActionDefinitionsBuilder({
361       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362 
363   if (ST.hasVOP3PInsts()) {
364     MinNumMaxNum.customFor(FPTypesPK16)
365       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366       .clampMaxNumElements(0, S16, 2)
367       .clampScalar(0, S16, S64)
368       .scalarize(0);
369   } else if (ST.has16BitInsts()) {
370     MinNumMaxNum.customFor(FPTypes16)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else {
374     MinNumMaxNum.customFor(FPTypesBase)
375       .clampScalar(0, S32, S64)
376       .scalarize(0);
377   }
378 
379   if (ST.hasVOP3PInsts())
380     FPOpActions.clampMaxNumElements(0, S16, 2);
381 
382   FPOpActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   TrigActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   FDIVActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   getActionDefinitionsBuilder({G_FNEG, G_FABS})
395     .legalFor(FPTypesPK16)
396     .clampMaxNumElements(0, S16, 2)
397     .scalarize(0)
398     .clampScalar(0, S16, S64);
399 
400   // TODO: Implement
401   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
402 
403   if (ST.has16BitInsts()) {
404     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
405       .legalFor({S32, S64, S16})
406       .scalarize(0)
407       .clampScalar(0, S16, S64);
408   } else {
409     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
410       .legalFor({S32, S64})
411       .scalarize(0)
412       .clampScalar(0, S32, S64);
413   }
414 
415   getActionDefinitionsBuilder(G_FPTRUNC)
416     .legalFor({{S32, S64}, {S16, S32}})
417     .scalarize(0);
418 
419   getActionDefinitionsBuilder(G_FPEXT)
420     .legalFor({{S64, S32}, {S32, S16}})
421     .lowerFor({{S64, S16}}) // FIXME: Implement
422     .scalarize(0);
423 
424   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
425   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
426 
427   getActionDefinitionsBuilder(G_FSUB)
428       // Use actual fsub instruction
429       .legalFor({S32})
430       // Must use fadd + fneg
431       .lowerFor({S64, S16, V2S16})
432       .scalarize(0)
433       .clampScalar(0, S32, S64);
434 
435   // Whether this is legal depends on the floating point mode for the function.
436   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
437   if (ST.hasMadF16())
438     FMad.customFor({S32, S16});
439   else
440     FMad.customFor({S32});
441   FMad.scalarize(0)
442       .lower();
443 
444   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
445     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
446                {S32, S1}, {S64, S1}, {S16, S1},
447                {S96, S32},
448                // FIXME: Hack
449                {S64, LLT::scalar(33)},
450                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
451     .scalarize(0);
452 
453   // TODO: Split s1->s64 during regbankselect for VALU.
454   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
455     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
456     .lowerFor({{S32, S64}})
457     .lowerIf(typeIs(1, S1))
458     .customFor({{S64, S64}});
459   if (ST.has16BitInsts())
460     IToFP.legalFor({{S16, S16}});
461   IToFP.clampScalar(1, S32, S64)
462        .scalarize(0);
463 
464   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
465     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
466   if (ST.has16BitInsts())
467     FPToI.legalFor({{S16, S16}});
468   else
469     FPToI.minScalar(1, S32);
470 
471   FPToI.minScalar(0, S32)
472        .scalarize(0);
473 
474   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
475     .scalarize(0)
476     .lower();
477 
478   if (ST.has16BitInsts()) {
479     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
480       .legalFor({S16, S32, S64})
481       .clampScalar(0, S16, S64)
482       .scalarize(0);
483   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
484     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
485       .legalFor({S32, S64})
486       .clampScalar(0, S32, S64)
487       .scalarize(0);
488   } else {
489     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
490       .legalFor({S32})
491       .customFor({S64})
492       .clampScalar(0, S32, S64)
493       .scalarize(0);
494   }
495 
496   getActionDefinitionsBuilder(G_PTR_ADD)
497     .legalForCartesianProduct(AddrSpaces64, {S64})
498     .legalForCartesianProduct(AddrSpaces32, {S32})
499     .scalarize(0);
500 
501   getActionDefinitionsBuilder(G_PTR_MASK)
502     .scalarize(0)
503     .alwaysLegal();
504 
505   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
506 
507   auto &CmpBuilder =
508     getActionDefinitionsBuilder(G_ICMP)
509     // The compare output type differs based on the register bank of the output,
510     // so make both s1 and s32 legal.
511     //
512     // Scalar compares producing output in scc will be promoted to s32, as that
513     // is the allocatable register type that will be needed for the copy from
514     // scc. This will be promoted during RegBankSelect, and we assume something
515     // before that won't try to use s32 result types.
516     //
517     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
518     // bank.
519     .legalForCartesianProduct(
520       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
521     .legalForCartesianProduct(
522       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
523   if (ST.has16BitInsts()) {
524     CmpBuilder.legalFor({{S1, S16}});
525   }
526 
527   CmpBuilder
528     .widenScalarToNextPow2(1)
529     .clampScalar(1, S32, S64)
530     .scalarize(0)
531     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
532 
533   getActionDefinitionsBuilder(G_FCMP)
534     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
535     .widenScalarToNextPow2(1)
536     .clampScalar(1, S32, S64)
537     .scalarize(0);
538 
539   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
540   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
541                                G_FLOG, G_FLOG2, G_FLOG10})
542     .legalFor({S32})
543     .scalarize(0);
544 
545   // The 64-bit versions produce 32-bit results, but only on the SALU.
546   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
547                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
548                                G_CTPOP})
549     .legalFor({{S32, S32}, {S32, S64}})
550     .clampScalar(0, S32, S32)
551     .clampScalar(1, S32, S64)
552     .scalarize(0)
553     .widenScalarToNextPow2(0, 32)
554     .widenScalarToNextPow2(1, 32);
555 
556   // TODO: Expand for > s32
557   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
558     .legalFor({S32})
559     .clampScalar(0, S32, S32)
560     .scalarize(0);
561 
562   if (ST.has16BitInsts()) {
563     if (ST.hasVOP3PInsts()) {
564       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
565         .legalFor({S32, S16, V2S16})
566         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
567         .clampMaxNumElements(0, S16, 2)
568         .clampScalar(0, S16, S32)
569         .widenScalarToNextPow2(0)
570         .scalarize(0);
571     } else {
572       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
573         .legalFor({S32, S16})
574         .widenScalarToNextPow2(0)
575         .clampScalar(0, S16, S32)
576         .scalarize(0);
577     }
578   } else {
579     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
580       .legalFor({S32})
581       .clampScalar(0, S32, S32)
582       .widenScalarToNextPow2(0)
583       .scalarize(0);
584   }
585 
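  // Helpers comparing the bit widths of two type indices in a query, used by
  // the pointer <-> integer cast rules below.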
586   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
587     return [=](const LegalityQuery &Query) {
588       return Query.Types[TypeIdx0].getSizeInBits() <
589              Query.Types[TypeIdx1].getSizeInBits();
590     };
591   };
592 
593   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
594     return [=](const LegalityQuery &Query) {
595       return Query.Types[TypeIdx0].getSizeInBits() >
596              Query.Types[TypeIdx1].getSizeInBits();
597     };
598   };
599 
600   getActionDefinitionsBuilder(G_INTTOPTR)
601     // List the common cases
602     .legalForCartesianProduct(AddrSpaces64, {S64})
603     .legalForCartesianProduct(AddrSpaces32, {S32})
604     .scalarize(0)
605     // Accept any address space as long as the size matches
606     .legalIf(sameSize(0, 1))
607     .widenScalarIf(smallerThan(1, 0),
608       [](const LegalityQuery &Query) {
609         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
610       })
611     .narrowScalarIf(greaterThan(1, 0),
612       [](const LegalityQuery &Query) {
613         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
614       });
615 
616   getActionDefinitionsBuilder(G_PTRTOINT)
617     // List the common cases
618     .legalForCartesianProduct(AddrSpaces64, {S64})
619     .legalForCartesianProduct(AddrSpaces32, {S32})
620     .scalarize(0)
621     // Accept any address space as long as the size matches
622     .legalIf(sameSize(0, 1))
623     .widenScalarIf(smallerThan(0, 1),
624       [](const LegalityQuery &Query) {
625         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
626       })
627     .narrowScalarIf(
628       greaterThan(0, 1),
629       [](const LegalityQuery &Query) {
630         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
631       });
632 
633   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
634     .scalarize(0)
635     .custom();
636 
637   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
638   // handle some operations by just promoting the register during
639   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
640   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
641     switch (AS) {
642     // FIXME: Private element size.
643     case AMDGPUAS::PRIVATE_ADDRESS:
644       return 32;
645     // FIXME: Check subtarget
646     case AMDGPUAS::LOCAL_ADDRESS:
647       return ST.useDS128() ? 128 : 64;
648 
649     // Treat constant and global as identical. SMRD loads are sometimes usable
650     // for global loads (ideally constant address space should be eliminated)
651     // depending on the context. Legality cannot be context dependent, but
652     // RegBankSelect can split the load as necessary depending on the pointer
653     // register bank/uniformity and if the memory is invariant or not written in
654     // a kernel.
655     case AMDGPUAS::CONSTANT_ADDRESS:
656     case AMDGPUAS::GLOBAL_ADDRESS:
657       return 512;
658     default:
659       return 128;
660     }
661   };
662 
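  // Return true if a load/store must be split: vector extloads, accesses wider
  // than the address space supports, dwordx3 accesses on subtargets without
  // them, and under-aligned accesses the target cannot perform.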
663   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
664     const LLT DstTy = Query.Types[0];
665 
666     // Split vector extloads.
667     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
668     unsigned Align = Query.MMODescrs[0].AlignInBits;
669 
670     if (MemSize < DstTy.getSizeInBits())
671       MemSize = std::max(MemSize, Align);
672 
673     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
674       return true;
675 
676     const LLT PtrTy = Query.Types[1];
677     unsigned AS = PtrTy.getAddressSpace();
678     if (MemSize > maxSizeForAddrSpace(AS))
679       return true;
680 
681     // Catch weird sized loads that don't evenly divide into the access sizes
682     // TODO: May be able to widen depending on alignment etc.
683     unsigned NumRegs = MemSize / 32;
684     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
685       return true;
686 
687     if (Align < MemSize) {
688       const SITargetLowering *TLI = ST.getTargetLowering();
689       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
690     }
691 
692     return false;
693   };
694 
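  // A required alignment of 0 in the memory descriptors below means "no
  // restriction"; subtargets with unaligned buffer access drop the natural
  // alignment requirement for global/flat accesses.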
695   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
696   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
697   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
698 
699   // TODO: Refine based on subtargets which support unaligned access or 128-bit
700   // LDS
701   // TODO: Unsupported flat for SI.
702 
703   for (unsigned Op : {G_LOAD, G_STORE}) {
704     const bool IsStore = Op == G_STORE;
705 
706     auto &Actions = getActionDefinitionsBuilder(Op);
707     // Whitelist the common cases.
708     // TODO: Pointer loads
709     // TODO: Wide constant loads
710     // TODO: Only CI+ has 3x loads
711     // TODO: Loads to s16 on gfx9
712     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
713                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
714                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
715                                       {S96, GlobalPtr, 96, GlobalAlign32},
716                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
717                                       {S128, GlobalPtr, 128, GlobalAlign32},
718                                       {S64, GlobalPtr, 64, GlobalAlign32},
719                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
720                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
721                                       {S32, GlobalPtr, 8, GlobalAlign8},
722                                       {S32, GlobalPtr, 16, GlobalAlign16},
723 
724                                       {S32, LocalPtr, 32, 32},
725                                       {S64, LocalPtr, 64, 32},
726                                       {V2S32, LocalPtr, 64, 32},
727                                       {S32, LocalPtr, 8, 8},
728                                       {S32, LocalPtr, 16, 16},
729                                       {V2S16, LocalPtr, 32, 32},
730 
731                                       {S32, PrivatePtr, 32, 32},
732                                       {S32, PrivatePtr, 8, 8},
733                                       {S32, PrivatePtr, 16, 16},
734                                       {V2S16, PrivatePtr, 32, 32},
735 
736                                       {S32, FlatPtr, 32, GlobalAlign32},
737                                       {S32, FlatPtr, 16, GlobalAlign16},
738                                       {S32, FlatPtr, 8, GlobalAlign8},
739                                       {V2S16, FlatPtr, 32, GlobalAlign32},
740 
741                                       {S32, ConstantPtr, 32, GlobalAlign32},
742                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
743                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
744                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
745                                       {S64, ConstantPtr, 64, GlobalAlign32},
746                                       {S128, ConstantPtr, 128, GlobalAlign32},
747                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
748     Actions
749         .customIf(typeIs(1, Constant32Ptr))
750         .narrowScalarIf(
751             [=](const LegalityQuery &Query) -> bool {
752               return !Query.Types[0].isVector() && needToSplitLoad(Query);
753             },
754             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
755               const LLT DstTy = Query.Types[0];
756               const LLT PtrTy = Query.Types[1];
757 
758               const unsigned DstSize = DstTy.getSizeInBits();
759               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
760 
761               // Split extloads.
762               if (DstSize > MemSize)
763                 return std::make_pair(0, LLT::scalar(MemSize));
764 
765               if (DstSize > 32 && (DstSize % 32 != 0)) {
766                 // FIXME: Need a way to specify non-extload of larger size if
767                 // suitably aligned.
768                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
769               }
770 
771               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
772               if (MemSize > MaxSize)
773                 return std::make_pair(0, LLT::scalar(MaxSize));
774 
775               unsigned Align = Query.MMODescrs[0].AlignInBits;
776               return std::make_pair(0, LLT::scalar(Align));
777             })
778         .fewerElementsIf(
779             [=](const LegalityQuery &Query) -> bool {
780               return Query.Types[0].isVector() && needToSplitLoad(Query);
781             },
782             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
783               const LLT DstTy = Query.Types[0];
784               const LLT PtrTy = Query.Types[1];
785 
786               LLT EltTy = DstTy.getElementType();
787               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
788 
789               // Split if it's too large for the address space.
790               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
791                 unsigned NumElts = DstTy.getNumElements();
792                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
793 
794                 // FIXME: Refine when odd breakdowns handled
795                 // The scalars will need to be re-legalized.
796                 if (NumPieces == 1 || NumPieces >= NumElts ||
797                     NumElts % NumPieces != 0)
798                   return std::make_pair(0, EltTy);
799 
800                 return std::make_pair(0,
801                                       LLT::vector(NumElts / NumPieces, EltTy));
802               }
803 
804               // Need to split because of alignment.
805               unsigned Align = Query.MMODescrs[0].AlignInBits;
806               unsigned EltSize = EltTy.getSizeInBits();
807               if (EltSize > Align &&
808                   (EltSize / Align < DstTy.getNumElements())) {
809                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
810               }
811 
812               // May need relegalization for the scalars.
813               return std::make_pair(0, EltTy);
814             })
815         .minScalar(0, S32);
816 
817     if (IsStore)
818       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
819 
820     // TODO: Need a bitcast lower option?
821     Actions
822         .legalIf([=](const LegalityQuery &Query) {
823           const LLT Ty0 = Query.Types[0];
824           unsigned Size = Ty0.getSizeInBits();
825           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
826           unsigned Align = Query.MMODescrs[0].AlignInBits;
827 
828           // FIXME: Widening store from alignment not valid.
829           if (MemSize < Size)
830             MemSize = std::max(MemSize, Align);
831 
832           // No extending vector loads.
833           if (Size > MemSize && Ty0.isVector())
834             return false;
835 
836           switch (MemSize) {
837           case 8:
838           case 16:
839             return Size == 32;
840           case 32:
841           case 64:
842           case 128:
843             return true;
844           case 96:
845             return ST.hasDwordx3LoadStores();
846           case 256:
847           case 512:
848             return true;
849           default:
850             return false;
851           }
852         })
853         .widenScalarToNextPow2(0)
854         // TODO: v3s32->v4s32 with alignment
855         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
856   }
857 
858   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
859                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
860                                                   {S32, GlobalPtr, 16, 2 * 8},
861                                                   {S32, LocalPtr, 8, 8},
862                                                   {S32, LocalPtr, 16, 16},
863                                                   {S32, PrivatePtr, 8, 8},
864                                                   {S32, PrivatePtr, 16, 16},
865                                                   {S32, ConstantPtr, 8, 8},
866                                                   {S32, ConstantPtr, 16, 2 * 8}});
867   if (ST.hasFlatAddressSpace()) {
868     ExtLoads.legalForTypesWithMemDesc(
869         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
870   }
871 
872   ExtLoads.clampScalar(0, S32, S32)
873           .widenScalarToNextPow2(0)
874           .unsupportedIfMemSizeNotPow2()
875           .lower();
876 
877   auto &Atomics = getActionDefinitionsBuilder(
878     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
879      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
880      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
881      G_ATOMICRMW_UMIN})
882     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
883                {S64, GlobalPtr}, {S64, LocalPtr}});
884   if (ST.hasFlatAddressSpace()) {
885     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
886   }
887 
888   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
889     .legalFor({{S32, LocalPtr}});
890 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
893   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
894     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
895                 {S32, FlatPtr}, {S64, FlatPtr}})
896     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
897                {S32, RegionPtr}, {S64, RegionPtr}});
898 
899   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
900     .lower();
901 
902   // TODO: Pointer types, any 32-bit or 64-bit vector
903 
904   // Condition should be s32 for scalar, s1 for vector.
905   getActionDefinitionsBuilder(G_SELECT)
906     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
907           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
908           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
909     .clampScalar(0, S16, S64)
910     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
911     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
912     .scalarize(1)
913     .clampMaxNumElements(0, S32, 2)
914     .clampMaxNumElements(0, LocalPtr, 2)
915     .clampMaxNumElements(0, PrivatePtr, 2)
916     .scalarize(0)
917     .widenScalarToNextPow2(0)
918     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
919 
920   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
921   // be more flexible with the shift amount type.
922   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
923     .legalFor({{S32, S32}, {S64, S32}});
924   if (ST.has16BitInsts()) {
925     if (ST.hasVOP3PInsts()) {
926       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
927             .clampMaxNumElements(0, S16, 2);
928     } else
929       Shifts.legalFor({{S16, S32}, {S16, S16}});
930 
931     Shifts.clampScalar(1, S16, S32);
932     Shifts.clampScalar(0, S16, S64);
933     Shifts.widenScalarToNextPow2(0, 16);
934   } else {
935     // Make sure we legalize the shift amount type first, as the general
936     // expansion for the shifted type will produce much worse code if it hasn't
937     // been truncated already.
938     Shifts.clampScalar(1, S32, S32);
939     Shifts.clampScalar(0, S32, S64);
940     Shifts.widenScalarToNextPow2(0, 32);
941   }
942   Shifts.scalarize(0);
943 
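  // Vector element insert/extract. Element types of 16 bits or a multiple of
  // 32 bits, in vectors of up to 1024 bits with an s32 index, are custom
  // handled; other cases are first clamped to supported scalar sizes.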
944   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
945     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
946     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
947     unsigned IdxTypeIdx = 2;
948 
949     getActionDefinitionsBuilder(Op)
950       .customIf([=](const LegalityQuery &Query) {
951           const LLT EltTy = Query.Types[EltTypeIdx];
952           const LLT VecTy = Query.Types[VecTypeIdx];
953           const LLT IdxTy = Query.Types[IdxTypeIdx];
954           return (EltTy.getSizeInBits() == 16 ||
955                   EltTy.getSizeInBits() % 32 == 0) &&
956                  VecTy.getSizeInBits() % 32 == 0 &&
957                  VecTy.getSizeInBits() <= 1024 &&
958                  IdxTy.getSizeInBits() == 32;
959         })
960       .clampScalar(EltTypeIdx, S32, S64)
961       .clampScalar(VecTypeIdx, S32, S64)
962       .clampScalar(IdxTypeIdx, S32, S32);
963   }
964 
965   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
966     .unsupportedIf([=](const LegalityQuery &Query) {
967         const LLT &EltTy = Query.Types[1].getElementType();
968         return Query.Types[0] != EltTy;
969       });
970 
971   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
972     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
973     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
974 
975     // FIXME: Doesn't handle extract of illegal sizes.
976     getActionDefinitionsBuilder(Op)
977       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
978       // FIXME: Multiples of 16 should not be legal.
979       .legalIf([=](const LegalityQuery &Query) {
980           const LLT BigTy = Query.Types[BigTyIdx];
981           const LLT LitTy = Query.Types[LitTyIdx];
982           return (BigTy.getSizeInBits() % 32 == 0) &&
983                  (LitTy.getSizeInBits() % 16 == 0);
984         })
985       .widenScalarIf(
986         [=](const LegalityQuery &Query) {
987           const LLT BigTy = Query.Types[BigTyIdx];
988           return (BigTy.getScalarSizeInBits() < 16);
989         },
990         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
991       .widenScalarIf(
992         [=](const LegalityQuery &Query) {
993           const LLT LitTy = Query.Types[LitTyIdx];
994           return (LitTy.getScalarSizeInBits() < 16);
995         },
996         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
997       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
998       .widenScalarToNextPow2(BigTyIdx, 32);
999 
1000   }
1001 
1002   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1003     .legalForCartesianProduct(AllS32Vectors, {S32})
1004     .legalForCartesianProduct(AllS64Vectors, {S64})
1005     .clampNumElements(0, V16S32, V32S32)
1006     .clampNumElements(0, V2S64, V16S64)
1007     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1008 
1009   if (ST.hasScalarPackInsts())
1010     BuildVector.legalFor({V2S16, S32});
1011 
1012   BuildVector
1013     .minScalarSameAs(1, 0)
1014     .legalIf(isRegisterType(0))
1015     .minScalarOrElt(0, S32);
1016 
1017   if (ST.hasScalarPackInsts()) {
1018     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1019       .legalFor({V2S16, S32})
1020       .lower();
1021   } else {
1022     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1023       .lower();
1024   }
1025 
1026   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1027     .legalIf(isRegisterType(0));
1028 
1029   // TODO: Don't fully scalarize v2s16 pieces
1030   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1031 
1032   // Merge/Unmerge
1033   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1034     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1035     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1036 
1037     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1038       const LLT &Ty = Query.Types[TypeIdx];
1039       if (Ty.isVector()) {
1040         const LLT &EltTy = Ty.getElementType();
1041         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1042           return true;
1043         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1044           return true;
1045       }
1046       return false;
1047     };
1048 
1049     auto &Builder = getActionDefinitionsBuilder(Op)
1050       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
1052       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1053       // valid.
1054       .clampScalar(LitTyIdx, S16, S256)
1055       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1056       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1057       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1058                            elementTypeIs(1, S16)),
1059                        changeTo(1, V2S16))
1060       // Break up vectors with weird elements into scalars
1061       .fewerElementsIf(
1062         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1063         scalarize(0))
1064       .fewerElementsIf(
1065         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1066         scalarize(1))
1067       .clampScalar(BigTyIdx, S32, S1024)
1068       .lowerFor({{S16, V2S16}});
1069 
1070     if (Op == G_MERGE_VALUES) {
1071       Builder.widenScalarIf(
1072         // TODO: Use 16-bit shifts if legal for 8-bit values?
1073         [=](const LegalityQuery &Query) {
1074           const LLT Ty = Query.Types[LitTyIdx];
1075           return Ty.getSizeInBits() < 32;
1076         },
1077         changeTo(LitTyIdx, S32));
1078     }
1079 
1080     Builder.widenScalarIf(
1081       [=](const LegalityQuery &Query) {
1082         const LLT Ty = Query.Types[BigTyIdx];
1083         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1084           Ty.getSizeInBits() % 16 != 0;
1085       },
1086       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1089         const LLT &Ty = Query.Types[BigTyIdx];
1090         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1091         if (NewSizeInBits >= 256) {
1092           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1093           if (RoundedTo < NewSizeInBits)
1094             NewSizeInBits = RoundedTo;
1095         }
1096         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1097       })
1098       .legalIf([=](const LegalityQuery &Query) {
1099           const LLT &BigTy = Query.Types[BigTyIdx];
1100           const LLT &LitTy = Query.Types[LitTyIdx];
1101 
1102           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1103             return false;
1104           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1105             return false;
1106 
1107           return BigTy.getSizeInBits() % 16 == 0 &&
1108                  LitTy.getSizeInBits() % 16 == 0 &&
1109                  BigTy.getSizeInBits() <= 1024;
1110         })
1111       // Any vectors left are the wrong size. Scalarize them.
1112       .scalarize(0)
1113       .scalarize(1);
1114   }
1115 
1116   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1117 
1118   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1119     .legalFor({S64});
1120 
1121   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1122         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1123         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1124     .unsupported();
1125 
1126   computeTables();
1127   verify(*ST.getInstrInfo());
1128 }
1129 
1130 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1131                                          MachineRegisterInfo &MRI,
1132                                          MachineIRBuilder &B,
1133                                          GISelChangeObserver &Observer) const {
1134   switch (MI.getOpcode()) {
1135   case TargetOpcode::G_ADDRSPACE_CAST:
1136     return legalizeAddrSpaceCast(MI, MRI, B);
1137   case TargetOpcode::G_FRINT:
1138     return legalizeFrint(MI, MRI, B);
1139   case TargetOpcode::G_FCEIL:
1140     return legalizeFceil(MI, MRI, B);
1141   case TargetOpcode::G_INTRINSIC_TRUNC:
1142     return legalizeIntrinsicTrunc(MI, MRI, B);
1143   case TargetOpcode::G_SITOFP:
1144     return legalizeITOFP(MI, MRI, B, true);
1145   case TargetOpcode::G_UITOFP:
1146     return legalizeITOFP(MI, MRI, B, false);
1147   case TargetOpcode::G_FMINNUM:
1148   case TargetOpcode::G_FMAXNUM:
1149   case TargetOpcode::G_FMINNUM_IEEE:
1150   case TargetOpcode::G_FMAXNUM_IEEE:
1151     return legalizeMinNumMaxNum(MI, MRI, B);
1152   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1153     return legalizeExtractVectorElt(MI, MRI, B);
1154   case TargetOpcode::G_INSERT_VECTOR_ELT:
1155     return legalizeInsertVectorElt(MI, MRI, B);
1156   case TargetOpcode::G_FSIN:
1157   case TargetOpcode::G_FCOS:
1158     return legalizeSinCos(MI, MRI, B);
1159   case TargetOpcode::G_GLOBAL_VALUE:
1160     return legalizeGlobalValue(MI, MRI, B);
1161   case TargetOpcode::G_LOAD:
1162     return legalizeLoad(MI, MRI, B, Observer);
1163   case TargetOpcode::G_FMAD:
1164     return legalizeFMad(MI, MRI, B);
1165   case TargetOpcode::G_FDIV:
1166     return legalizeFDIV(MI, MRI, B);
1167   case TargetOpcode::G_ATOMIC_CMPXCHG:
1168     return legalizeAtomicCmpXChg(MI, MRI, B);
1169   default:
1170     return false;
1171   }
1172 
1173   llvm_unreachable("expected switch to return");
1174 }
1175 
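// Compute the 32-bit aperture (the high half of the flat address) for the
// given LDS or private address space, either by reading the hardware aperture
// registers or by loading it from the queue descriptor (amd_queue_t).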
1176 Register AMDGPULegalizerInfo::getSegmentAperture(
1177   unsigned AS,
1178   MachineRegisterInfo &MRI,
1179   MachineIRBuilder &B) const {
1180   MachineFunction &MF = B.getMF();
1181   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1182   const LLT S32 = LLT::scalar(32);
1183 
1184   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1185 
1186   if (ST.hasApertureRegs()) {
1187     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1188     // getreg.
1189     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1190         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1191         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1192     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1193         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1194         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1195     unsigned Encoding =
1196         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1197         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1198         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1199 
1200     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1201     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1202 
1203     B.buildInstr(AMDGPU::S_GETREG_B32)
1204       .addDef(GetReg)
1205       .addImm(Encoding);
1206     MRI.setType(GetReg, S32);
1207 
1208     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1209     B.buildInstr(TargetOpcode::G_SHL)
1210       .addDef(ApertureReg)
1211       .addUse(GetReg)
1212       .addUse(ShiftAmt.getReg(0));
1213 
1214     return ApertureReg;
1215   }
1216 
1217   Register QueuePtr = MRI.createGenericVirtualRegister(
1218     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1219 
1220   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1221   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1222     return Register();
1223 
1224   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1225   // private_segment_aperture_base_hi.
1226   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1227 
1228   // TODO: can we be smarter about machine pointer info?
1229   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1230   MachineMemOperand *MMO = MF.getMachineMemOperand(
1231     PtrInfo,
1232     MachineMemOperand::MOLoad |
1233     MachineMemOperand::MODereferenceable |
1234     MachineMemOperand::MOInvariant,
1235     4,
1236     MinAlign(64, StructOffset));
1237 
1238   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1239   Register LoadAddr;
1240 
1241   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1242   B.buildLoad(LoadResult, LoadAddr, *MMO);
1243   return LoadResult;
1244 }
1245 
1246 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1247   MachineInstr &MI, MachineRegisterInfo &MRI,
1248   MachineIRBuilder &B) const {
1249   MachineFunction &MF = B.getMF();
1250 
1251   B.setInstr(MI);
1252 
1253   const LLT S32 = LLT::scalar(32);
1254   Register Dst = MI.getOperand(0).getReg();
1255   Register Src = MI.getOperand(1).getReg();
1256 
1257   LLT DstTy = MRI.getType(Dst);
1258   LLT SrcTy = MRI.getType(Src);
1259   unsigned DestAS = DstTy.getAddressSpace();
1260   unsigned SrcAS = SrcTy.getAddressSpace();
1261 
1262   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1263   // vector element.
1264   assert(!DstTy.isVector());
1265 
1266   const AMDGPUTargetMachine &TM
1267     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1268 
1269   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1270   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1271     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1272     return true;
1273   }
1274 
1275   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1276     // Truncate.
1277     B.buildExtract(Dst, Src, 0);
1278     MI.eraseFromParent();
1279     return true;
1280   }
1281 
1282   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1283     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1284     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1285 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to form
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1289     auto HighAddr = B.buildConstant(
1290       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1291     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1292     MI.eraseFromParent();
1293     return true;
1294   }
1295 
1296   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1297     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1298            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1299     unsigned NullVal = TM.getNullPointerValue(DestAS);
1300 
1301     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1302     auto FlatNull = B.buildConstant(SrcTy, 0);
1303 
1304     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1305 
1306     // Extract low 32-bits of the pointer.
1307     B.buildExtract(PtrLo32, Src, 0);
1308 
1309     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1310     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1311     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1312 
1313     MI.eraseFromParent();
1314     return true;
1315   }
1316 
1317   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1318     return false;
1319 
1320   if (!ST.hasFlatAddressSpace())
1321     return false;
1322 
1323   auto SegmentNull =
1324       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1325   auto FlatNull =
1326       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1327 
1328   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1329   if (!ApertureReg.isValid())
1330     return false;
1331 
1332   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1333   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1334 
1335   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1336 
1337   // Coerce the type of the low half of the result so we can use merge_values.
1338   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1339   B.buildInstr(TargetOpcode::G_PTRTOINT)
1340     .addDef(SrcAsInt)
1341     .addUse(Src);
1342 
1343   // TODO: Should we allow mismatched types but matching sizes in merges to
1344   // avoid the ptrtoint?
1345   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1346   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1347 
1348   MI.eraseFromParent();
1349   return true;
1350 }
1351 
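// Lower f64 G_FRINT using the 2^52 trick: adding and then subtracting
// copysign(2^52, x) rounds x to an integer in the default round-to-nearest
// mode. Magnitudes above 0x1.fffffffffffffp+51 are already integral and are
// returned unchanged.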
1352 bool AMDGPULegalizerInfo::legalizeFrint(
1353   MachineInstr &MI, MachineRegisterInfo &MRI,
1354   MachineIRBuilder &B) const {
1355   B.setInstr(MI);
1356 
1357   Register Src = MI.getOperand(1).getReg();
1358   LLT Ty = MRI.getType(Src);
1359   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1360 
1361   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1362   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1363 
1364   auto C1 = B.buildFConstant(Ty, C1Val);
1365   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1366 
1367   // TODO: Should this propagate fast-math-flags?
1368   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1369   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1370 
1371   auto C2 = B.buildFConstant(Ty, C2Val);
1372   auto Fabs = B.buildFAbs(Ty, Src);
1373 
1374   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1375   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1376   return true;
1377 }
1378 
1379 bool AMDGPULegalizerInfo::legalizeFceil(
1380   MachineInstr &MI, MachineRegisterInfo &MRI,
1381   MachineIRBuilder &B) const {
1382   B.setInstr(MI);
1383 
1384   const LLT S1 = LLT::scalar(1);
1385   const LLT S64 = LLT::scalar(64);
1386 
1387   Register Src = MI.getOperand(1).getReg();
1388   assert(MRI.getType(Src) == S64);
1389 
1390   // result = trunc(src)
1391   // if (src > 0.0 && src != result)
1392   //   result += 1.0
1393 
1394   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1395 
1396   const auto Zero = B.buildFConstant(S64, 0.0);
1397   const auto One = B.buildFConstant(S64, 1.0);
1398   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1399   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1400   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1401   auto Add = B.buildSelect(S64, And, One, Zero);
1402 
1403   // TODO: Should this propagate fast-math-flags?
1404   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1405   return true;
1406 }
1407 
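// Extract the biased exponent field from the high 32 bits of an f64 (bits
// 52..62 of the full value) with amdgcn.ubfe, then subtract the bias of 1023.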
1408 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1409                                               MachineIRBuilder &B) {
1410   const unsigned FractBits = 52;
1411   const unsigned ExpBits = 11;
1412   LLT S32 = LLT::scalar(32);
1413 
1414   auto Const0 = B.buildConstant(S32, FractBits - 32);
1415   auto Const1 = B.buildConstant(S32, ExpBits);
1416 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1420 
1421   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1422 }
1423 
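// Lower f64 G_INTRINSIC_TRUNC by masking off the fraction bits that lie below
// the exponent. Negative exponents produce a signed zero; exponents above 51
// mean the value already has no fractional bits and is returned unchanged.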
1424 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1425   MachineInstr &MI, MachineRegisterInfo &MRI,
1426   MachineIRBuilder &B) const {
1427   B.setInstr(MI);
1428 
1429   const LLT S1 = LLT::scalar(1);
1430   const LLT S32 = LLT::scalar(32);
1431   const LLT S64 = LLT::scalar(64);
1432 
1433   Register Src = MI.getOperand(1).getReg();
1434   assert(MRI.getType(Src) == S64);
1435 
1436   // TODO: Should this use extract since the low half is unused?
1437   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1438   Register Hi = Unmerge.getReg(1);
1439 
1440   // Extract the upper half, since this is where we will find the sign and
1441   // exponent.
1442   auto Exp = extractF64Exponent(Hi, B);
1443 
1444   const unsigned FractBits = 52;
1445 
1446   // Extract the sign bit.
1447   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1448   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1449 
1450   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1451 
1452   const auto Zero32 = B.buildConstant(S32, 0);
1453 
1454   // Extend back to 64-bits.
1455   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1456 
1457   auto Shr = B.buildAShr(S64, FractMask, Exp);
1458   auto Not = B.buildNot(S64, Shr);
1459   auto Tmp0 = B.buildAnd(S64, Src, Not);
1460   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1461 
1462   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1463   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1464 
1465   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1468 }
1469 
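// Lower s64 integer to f64 conversion by converting the two 32-bit halves
// separately: convert the high half (signed or unsigned as appropriate),
// scale it by 2^32 with amdgcn_ldexp, and add the unsigned conversion of the
// low half.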
1470 bool AMDGPULegalizerInfo::legalizeITOFP(
1471   MachineInstr &MI, MachineRegisterInfo &MRI,
1472   MachineIRBuilder &B, bool Signed) const {
1473   B.setInstr(MI);
1474 
1475   Register Dst = MI.getOperand(0).getReg();
1476   Register Src = MI.getOperand(1).getReg();
1477 
1478   const LLT S64 = LLT::scalar(64);
1479   const LLT S32 = LLT::scalar(32);
1480 
1481   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1482 
1483   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1484 
1485   auto CvtHi = Signed ?
1486     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1487     B.buildUITOFP(S64, Unmerge.getReg(1));
1488 
1489   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1490 
1491   auto ThirtyTwo = B.buildConstant(S32, 32);
1492   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1493     .addUse(CvtHi.getReg(0))
1494     .addUse(ThirtyTwo.getReg(0));
1495 
1496   // TODO: Should this propagate fast-math-flags?
1497   B.buildFAdd(Dst, LdExp, CvtLo);
1498   MI.eraseFromParent();
1499   return true;
1500 }
1501 
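// G_FMINNUM/G_FMAXNUM only need work when the function runs in ieee_mode: in
// that case the non-IEEE forms are expanded through the generic
// LegalizerHelper lowering, while the *_IEEE forms are left as-is.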
1502 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1503   MachineInstr &MI, MachineRegisterInfo &MRI,
1504   MachineIRBuilder &B) const {
1505   MachineFunction &MF = B.getMF();
1506   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1507 
1508   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1509                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1510 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1513   if (!MFI->getMode().IEEE)
1514     return !IsIEEEOp;
1515 
1516   if (IsIEEEOp)
1517     return true;
1518 
1519   MachineIRBuilder HelperBuilder(MI);
1520   GISelObserverWrapper DummyObserver;
1521   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1522   HelperBuilder.setInstr(MI);
1523   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1524 }
1525 
1526 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1527   MachineInstr &MI, MachineRegisterInfo &MRI,
1528   MachineIRBuilder &B) const {
1529   // TODO: Should move some of this into LegalizerHelper.
1530 
1531   // TODO: Promote dynamic indexing of s16 to s32
1532   // TODO: Dynamic s64 indexing is only legal for SGPR.
1533   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1534   if (!IdxVal) // Dynamic case will be selected to register indexing.
1535     return true;
1536 
1537   Register Dst = MI.getOperand(0).getReg();
1538   Register Vec = MI.getOperand(1).getReg();
1539 
1540   LLT VecTy = MRI.getType(Vec);
1541   LLT EltTy = VecTy.getElementType();
1542   assert(EltTy == MRI.getType(Dst));
1543 
1544   B.setInstr(MI);
1545 
1546   if (IdxVal.getValue() < VecTy.getNumElements())
1547     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1548   else
1549     B.buildUndef(Dst);
1550 
1551   MI.eraseFromParent();
1552   return true;
1553 }
1554 
1555 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1556   MachineInstr &MI, MachineRegisterInfo &MRI,
1557   MachineIRBuilder &B) const {
1558   // TODO: Should move some of this into LegalizerHelper.
1559 
1560   // TODO: Promote dynamic indexing of s16 to s32
1561   // TODO: Dynamic s64 indexing is only legal for SGPR.
1562   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1563   if (!IdxVal) // Dynamic case will be selected to register indexing.
1564     return true;
1565 
1566   Register Dst = MI.getOperand(0).getReg();
1567   Register Vec = MI.getOperand(1).getReg();
1568   Register Ins = MI.getOperand(2).getReg();
1569 
1570   LLT VecTy = MRI.getType(Vec);
1571   LLT EltTy = VecTy.getElementType();
1572   assert(EltTy == MRI.getType(Ins));
1573 
1574   B.setInstr(MI);
1575 
1576   if (IdxVal.getValue() < VecTy.getNumElements())
1577     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1578   else
1579     B.buildUndef(Dst);
1580 
1581   MI.eraseFromParent();
1582   return true;
1583 }
1584 
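// Lower G_FSIN/G_FCOS to amdgcn_sin/amdgcn_cos. The input is pre-multiplied
// by 1/(2*pi), and on subtargets with a reduced trig input range the scaled
// value is additionally passed through amdgcn_fract, presumably to bring it
// into the range the hardware instruction accepts.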
1585 bool AMDGPULegalizerInfo::legalizeSinCos(
1586   MachineInstr &MI, MachineRegisterInfo &MRI,
1587   MachineIRBuilder &B) const {
1588   B.setInstr(MI);
1589 
1590   Register DstReg = MI.getOperand(0).getReg();
1591   Register SrcReg = MI.getOperand(1).getReg();
1592   LLT Ty = MRI.getType(DstReg);
1593   unsigned Flags = MI.getFlags();
1594 
1595   Register TrigVal;
1596   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1597   if (ST.hasTrigReducedRange()) {
1598     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1599     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1600       .addUse(MulVal.getReg(0))
1601       .setMIFlags(Flags).getReg(0);
1602   } else
1603     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1604 
1605   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1606     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1607   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1608     .addUse(TrigVal)
1609     .setMIFlags(Flags);
1610   MI.eraseFromParent();
1611   return true;
1612 }
1613 
1614 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1615   Register DstReg, LLT PtrTy,
1616   MachineIRBuilder &B, const GlobalValue *GV,
1617   unsigned Offset, unsigned GAFlags) const {
1618   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1619   // to the following code sequence:
1620   //
1621   // For constant address space:
1622   //   s_getpc_b64 s[0:1]
1623   //   s_add_u32 s0, s0, $symbol
1624   //   s_addc_u32 s1, s1, 0
1625   //
1626   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1627   //   a fixup or relocation is emitted to replace $symbol with a literal
1628   //   constant, which is a pc-relative offset from the encoding of the $symbol
1629   //   operand to the global variable.
1630   //
1631   // For global address space:
1632   //   s_getpc_b64 s[0:1]
1633   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1634   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1635   //
1636   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1637   //   fixups or relocations are emitted to replace $symbol@*@lo and
1638   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1639   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1640   //   operand to the global variable.
1641   //
1642   // What we want here is an offset from the value returned by s_getpc
1643   // (which is the address of the s_add_u32 instruction) to the global
1644   // variable, but since the encoding of $symbol starts 4 bytes after the start
1645   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1646   // small. This requires us to add 4 to the global variable offset in order to
1647   // compute the correct address.
1648 
1649   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1650 
1651   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1652     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1653 
1654   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1655     .addDef(PCReg);
1656 
1657   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1658   if (GAFlags == SIInstrInfo::MO_NONE)
1659     MIB.addImm(0);
1660   else
1661     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1662 
1663   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1664 
1665   if (PtrTy.getSizeInBits() == 32)
1666     B.buildExtract(DstReg, PCReg, 0);
1667   return true;
}
1669 
1670 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1671   MachineInstr &MI, MachineRegisterInfo &MRI,
1672   MachineIRBuilder &B) const {
1673   Register DstReg = MI.getOperand(0).getReg();
1674   LLT Ty = MRI.getType(DstReg);
1675   unsigned AS = Ty.getAddressSpace();
1676 
1677   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1678   MachineFunction &MF = B.getMF();
1679   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1680   B.setInstr(MI);
1681 
1682   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1683     if (!MFI->isEntryFunction()) {
1684       const Function &Fn = MF.getFunction();
1685       DiagnosticInfoUnsupported BadLDSDecl(
1686         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1687       Fn.getContext().diagnose(BadLDSDecl);
1688     }
1689 
1690     // TODO: We could emit code to handle the initialization somewhere.
1691     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1692       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1693       MI.eraseFromParent();
1694       return true;
1695     }
1696 
1697     const Function &Fn = MF.getFunction();
1698     DiagnosticInfoUnsupported BadInit(
1699       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1700     Fn.getContext().diagnose(BadInit);
1701     return true;
1702   }
1703 
1704   const SITargetLowering *TLI = ST.getTargetLowering();
1705 
1706   if (TLI->shouldEmitFixup(GV)) {
1707     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1708     MI.eraseFromParent();
1709     return true;
1710   }
1711 
1712   if (TLI->shouldEmitPCReloc(GV)) {
1713     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1714     MI.eraseFromParent();
1715     return true;
1716   }
1717 
1718   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1719   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1720 
1721   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1722     MachinePointerInfo::getGOT(MF),
1723     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1724     MachineMemOperand::MOInvariant,
1725     8 /*Size*/, 8 /*Align*/);
1726 
1727   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1728 
1729   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1731     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1732     B.buildExtract(DstReg, Load, 0);
1733   } else
1734     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1735 
1736   MI.eraseFromParent();
1737   return true;
1738 }
1739 
1740 bool AMDGPULegalizerInfo::legalizeLoad(
1741   MachineInstr &MI, MachineRegisterInfo &MRI,
1742   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1743   B.setInstr(MI);
1744   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1745   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1746   Observer.changingInstr(MI);
1747   MI.getOperand(1).setReg(Cast.getReg(0));
1748   Observer.changedInstr(MI);
1749   return true;
1750 }
1751 
1752 bool AMDGPULegalizerInfo::legalizeFMad(
1753   MachineInstr &MI, MachineRegisterInfo &MRI,
1754   MachineIRBuilder &B) const {
1755   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1756   assert(Ty.isScalar());
1757 
1758   MachineFunction &MF = B.getMF();
1759   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1760 
1761   // TODO: Always legal with future ftz flag.
1762   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1763     return true;
1764   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1765     return true;
1766 
1768   MachineIRBuilder HelperBuilder(MI);
1769   GISelObserverWrapper DummyObserver;
1770   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1771   HelperBuilder.setMBB(*MI.getParent());
1772   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1773 }
1774 
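// Lower G_ATOMIC_CMPXCHG on flat/global pointers to the target
// G_AMDGPU_ATOMIC_CMPXCHG pseudo, which takes the new value and the compare
// value packed together as a two-element vector operand.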
1775 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1776   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1777   Register DstReg = MI.getOperand(0).getReg();
1778   Register PtrReg = MI.getOperand(1).getReg();
1779   Register CmpVal = MI.getOperand(2).getReg();
1780   Register NewVal = MI.getOperand(3).getReg();
1781 
1782   assert(SITargetLowering::isFlatGlobalAddrSpace(
1783            MRI.getType(PtrReg).getAddressSpace()) &&
1784          "this should not have been custom lowered");
1785 
1786   LLT ValTy = MRI.getType(CmpVal);
1787   LLT VecTy = LLT::vector(2, ValTy);
1788 
1789   B.setInstr(MI);
1790   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1791 
1792   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1793     .addDef(DstReg)
1794     .addUse(PtrReg)
1795     .addUse(PackedVal)
1796     .setMemRefs(MI.memoperands());
1797 
1798   MI.eraseFromParent();
1799   return true;
1800 }
1801 
// Return the use branch instruction, or null if the usage is invalid.
1803 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1804                                        MachineRegisterInfo &MRI) {
1805   Register CondDef = MI.getOperand(0).getReg();
1806   if (!MRI.hasOneNonDBGUse(CondDef))
1807     return nullptr;
1808 
1809   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1810   return UseMI.getParent() == MI.getParent() &&
1811     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1812 }
1813 
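// Return the virtual register already associated with the physical live-in
// register Reg, or create a new one of type Ty and record the live-in if none
// exists yet.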
1814 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1815                                                 Register Reg, LLT Ty) const {
1816   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1817   if (LiveIn)
1818     return LiveIn;
1819 
1820   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1821   MRI.addLiveIn(Reg, NewReg);
1822   return NewReg;
1823 }
1824 
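// Copy a preloaded argument register into DstReg. Masked arguments (several
// values packed into one SGPR) are shifted and masked down to the requested
// field, and the entry block copy from the physical register is created
// lazily if it does not exist yet.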
1825 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1826                                          const ArgDescriptor *Arg) const {
1827   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1828     return false; // TODO: Handle these
1829 
1830   assert(Arg->getRegister().isPhysical());
1831 
1832   MachineRegisterInfo &MRI = *B.getMRI();
1833 
1834   LLT Ty = MRI.getType(DstReg);
1835   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1836 
1837   if (Arg->isMasked()) {
1838     // TODO: Should we try to emit this once in the entry block?
1839     const LLT S32 = LLT::scalar(32);
1840     const unsigned Mask = Arg->getMask();
1841     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1842 
1843     Register AndMaskSrc = LiveIn;
1844 
1845     if (Shift != 0) {
1846       auto ShiftAmt = B.buildConstant(S32, Shift);
1847       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1848     }
1849 
1850     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1851   } else
1852     B.buildCopy(DstReg, LiveIn);
1853 
  // Insert the argument copy if it doesn't already exist.
1855   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1856   if (!MRI.getVRegDef(LiveIn)) {
1857     // FIXME: Should have scoped insert pt
1858     MachineBasicBlock &OrigInsBB = B.getMBB();
1859     auto OrigInsPt = B.getInsertPt();
1860 
1861     MachineBasicBlock &EntryMBB = B.getMF().front();
1862     EntryMBB.addLiveIn(Arg->getRegister());
1863     B.setInsertPt(EntryMBB, EntryMBB.begin());
1864     B.buildCopy(LiveIn, Arg->getRegister());
1865 
1866     B.setInsertPt(OrigInsBB, OrigInsPt);
1867   }
1868 
1869   return true;
1870 }
1871 
1872 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1873   MachineInstr &MI,
1874   MachineRegisterInfo &MRI,
1875   MachineIRBuilder &B,
1876   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1877   B.setInstr(MI);
1878 
1879   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1880 
1881   const ArgDescriptor *Arg;
1882   const TargetRegisterClass *RC;
1883   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1884   if (!Arg) {
1885     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1886     return false;
1887   }
1888 
1889   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1890     MI.eraseFromParent();
1891     return true;
1892   }
1893 
1894   return false;
1895 }
1896 
1897 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1898                                        MachineRegisterInfo &MRI,
1899                                        MachineIRBuilder &B) const {
1900   B.setInstr(MI);
1901   Register Dst = MI.getOperand(0).getReg();
1902   LLT DstTy = MRI.getType(Dst);
1903   LLT S16 = LLT::scalar(16);
1904   LLT S32 = LLT::scalar(32);
1905   LLT S64 = LLT::scalar(64);
1906 
1907   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1908     return true;
1909 
1910   if (DstTy == S16)
1911     return legalizeFDIV16(MI, MRI, B);
1912   if (DstTy == S32)
1913     return legalizeFDIV32(MI, MRI, B);
1914   if (DstTy == S64)
1915     return legalizeFDIV64(MI, MRI, B);
1916 
1917   return false;
1918 }
1919 
1920 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1921                                                  MachineRegisterInfo &MRI,
1922                                                  MachineIRBuilder &B) const {
1923   Register Res = MI.getOperand(0).getReg();
1924   Register LHS = MI.getOperand(1).getReg();
1925   Register RHS = MI.getOperand(2).getReg();
1926 
1927   uint16_t Flags = MI.getFlags();
1928 
1929   LLT ResTy = MRI.getType(Res);
1930   LLT S32 = LLT::scalar(32);
1931   LLT S64 = LLT::scalar(64);
1932 
1933   const MachineFunction &MF = B.getMF();
1934   bool Unsafe =
1935     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1936 
1937   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1938     return false;
1939 
1940   if (!Unsafe && ResTy == S32 &&
1941       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1942     return false;
1943 
1944   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1945     // 1 / x -> RCP(x)
1946     if (CLHS->isExactlyValue(1.0)) {
1947       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1948         .addUse(RHS)
1949         .setMIFlags(Flags);
1950 
1951       MI.eraseFromParent();
1952       return true;
1953     }
1954 
1955     // -1 / x -> RCP( FNEG(x) )
1956     if (CLHS->isExactlyValue(-1.0)) {
1957       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1958       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1959         .addUse(FNeg.getReg(0))
1960         .setMIFlags(Flags);
1961 
1962       MI.eraseFromParent();
1963       return true;
1964     }
1965   }
1966 
1967   // x / y -> x * (1.0 / y)
1968   if (Unsafe) {
1969     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1970       .addUse(RHS)
1971       .setMIFlags(Flags);
1972     B.buildFMul(Res, LHS, RCP, Flags);
1973 
1974     MI.eraseFromParent();
1975     return true;
1976   }
1977 
1978   return false;
1979 }
1980 
1981 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1982                                          MachineRegisterInfo &MRI,
1983                                          MachineIRBuilder &B) const {
1984   B.setInstr(MI);
1985   Register Res = MI.getOperand(0).getReg();
1986   Register LHS = MI.getOperand(1).getReg();
1987   Register RHS = MI.getOperand(2).getReg();
1988 
1989   uint16_t Flags = MI.getFlags();
1990 
1991   LLT S16 = LLT::scalar(16);
1992   LLT S32 = LLT::scalar(32);
1993 
1994   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1995   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1996 
1997   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1998     .addUse(RHSExt.getReg(0))
1999     .setMIFlags(Flags);
2000 
2001   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2002   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2003 
2004   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2005     .addUse(RDst.getReg(0))
2006     .addUse(RHS)
2007     .addUse(LHS)
2008     .setMIFlags(Flags);
2009 
2010   MI.eraseFromParent();
2011   return true;
2012 }
2013 
2014 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2015 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2016 static void toggleSPDenormMode(bool Enable,
2017                                MachineIRBuilder &B,
2018                                const GCNSubtarget &ST,
2019                                AMDGPU::SIModeRegisterDefaults Mode) {
2020   // Set SP denorm mode to this value.
2021   unsigned SPDenormMode =
2022     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2023 
2024   if (ST.hasDenormModeInst()) {
2025     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2026     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2027                                    ? FP_DENORM_FLUSH_NONE
2028                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2029 
2030     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2031     B.buildInstr(AMDGPU::S_DENORM_MODE)
2032       .addImm(NewDenormModeValue);
2033 
2034   } else {
2035     // Select FP32 bit field in mode register.
2036     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2037                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2038                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2039 
2040     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2041       .addImm(SPDenormMode)
2042       .addImm(SPDenormModeBitField);
2043   }
2044 }
2045 
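// Lower f32 G_FDIV with the div_scale/div_fmas/div_fixup sequence: an initial
// amdgcn_rcp estimate of the scaled denominator is refined with a chain of
// FMAs, with FP32 denormals temporarily enabled (when off by default),
// presumably so the intermediate steps are not flushed.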
2046 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2047                                          MachineRegisterInfo &MRI,
2048                                          MachineIRBuilder &B) const {
2049   B.setInstr(MI);
2050   Register Res = MI.getOperand(0).getReg();
2051   Register LHS = MI.getOperand(1).getReg();
2052   Register RHS = MI.getOperand(2).getReg();
2053   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2054   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2055 
2056   uint16_t Flags = MI.getFlags();
2057 
2058   LLT S32 = LLT::scalar(32);
2059   LLT S1 = LLT::scalar(1);
2060 
2061   auto One = B.buildFConstant(S32, 1.0f);
2062 
2063   auto DenominatorScaled =
2064     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2065       .addUse(RHS)
2066       .addUse(LHS)
2067       .addImm(1)
2068       .setMIFlags(Flags);
2069   auto NumeratorScaled =
2070     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2071       .addUse(LHS)
2072       .addUse(RHS)
2073       .addImm(0)
2074       .setMIFlags(Flags);
2075 
2076   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2077     .addUse(DenominatorScaled.getReg(0))
2078     .setMIFlags(Flags);
2079   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2080 
2081   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2082   // aren't modeled as reading it.
2083   if (!Mode.FP32Denormals)
2084     toggleSPDenormMode(true, B, ST, Mode);
2085 
2086   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2087   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2088   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2089   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2090   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2091   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2092 
2093   if (!Mode.FP32Denormals)
2094     toggleSPDenormMode(false, B, ST, Mode);
2095 
2096   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2097     .addUse(Fma4.getReg(0))
2098     .addUse(Fma1.getReg(0))
2099     .addUse(Fma3.getReg(0))
2100     .addUse(NumeratorScaled.getReg(1))
2101     .setMIFlags(Flags);
2102 
2103   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2104     .addUse(Fmas.getReg(0))
2105     .addUse(RHS)
2106     .addUse(LHS)
2107     .setMIFlags(Flags);
2108 
2109   MI.eraseFromParent();
2110   return true;
2111 }
2112 
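// Lower f64 G_FDIV with the same div_scale based refinement as the f32 case,
// plus a workaround for subtargets where the div_scale condition output is
// unusable and has to be recomputed by comparing the high halves of the
// operands against the div_scale results.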
2113 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2114                                          MachineRegisterInfo &MRI,
2115                                          MachineIRBuilder &B) const {
2116   B.setInstr(MI);
2117   Register Res = MI.getOperand(0).getReg();
2118   Register LHS = MI.getOperand(1).getReg();
2119   Register RHS = MI.getOperand(2).getReg();
2120 
2121   uint16_t Flags = MI.getFlags();
2122 
2123   LLT S64 = LLT::scalar(64);
2124   LLT S1 = LLT::scalar(1);
2125 
2126   auto One = B.buildFConstant(S64, 1.0);
2127 
2128   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2129     .addUse(LHS)
2130     .addUse(RHS)
2131     .addImm(1)
2132     .setMIFlags(Flags);
2133 
2134   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2135 
2136   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2137     .addUse(DivScale0.getReg(0))
2138     .setMIFlags(Flags);
2139 
2140   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2141   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2142   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2143 
2144   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2145     .addUse(LHS)
2146     .addUse(RHS)
2147     .addImm(0)
2148     .setMIFlags(Flags);
2149 
2150   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2152   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2153 
2154   Register Scale;
2155   if (!ST.hasUsableDivScaleConditionOutput()) {
2156     // Workaround a hardware bug on SI where the condition output from div_scale
2157     // is not usable.
2158 
2159     Scale = MRI.createGenericVirtualRegister(S1);
2160 
2161     LLT S32 = LLT::scalar(32);
2162 
2163     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2164     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2165     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2166     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2167 
2168     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2169                               Scale1Unmerge.getReg(1));
2170     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2171                               Scale0Unmerge.getReg(1));
2172     B.buildXor(Scale, CmpNum, CmpDen);
2173   } else {
2174     Scale = DivScale1.getReg(1);
2175   }
2176 
2177   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2178     .addUse(Fma4.getReg(0))
2179     .addUse(Fma3.getReg(0))
2180     .addUse(Mul.getReg(0))
2181     .addUse(Scale)
2182     .setMIFlags(Flags);
2183 
2184   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2185     .addUse(Fmas.getReg(0))
2186     .addUse(RHS)
2187     .addUse(LHS)
2188     .setMIFlags(Flags);
2189 
2190   MI.eraseFromParent();
2191   return true;
2192 }
2193 
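// Lower amdgcn_fdiv_fast. When |denominator| is large (0x6f800000 is
// 0x1.0p+96 as f32), it is pre-scaled by 0x1.0p-32 (0x2f800000) before taking
// amdgcn_rcp, and the same scale factor is applied to the final product so
// the quotient is unchanged.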
2194 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2195                                                  MachineRegisterInfo &MRI,
2196                                                  MachineIRBuilder &B) const {
2197   B.setInstr(MI);
2198   Register Res = MI.getOperand(0).getReg();
2199   Register LHS = MI.getOperand(2).getReg();
2200   Register RHS = MI.getOperand(3).getReg();
2201   uint16_t Flags = MI.getFlags();
2202 
2203   LLT S32 = LLT::scalar(32);
2204   LLT S1 = LLT::scalar(1);
2205 
2206   auto Abs = B.buildFAbs(S32, RHS, Flags);
2207   const APFloat C0Val(1.0f);
2208 
2209   auto C0 = B.buildConstant(S32, 0x6f800000);
2210   auto C1 = B.buildConstant(S32, 0x2f800000);
2211   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2212 
2213   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2214   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2215 
2216   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2217 
2218   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2219     .addUse(Mul0.getReg(0))
2220     .setMIFlags(Flags);
2221 
2222   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2223 
2224   B.buildFMul(Res, Sel, Mul1, Flags);
2225 
2226   MI.eraseFromParent();
2227   return true;
2228 }
2229 
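// For kernels, the implicit argument pointer is the kernarg segment pointer
// plus the implicit parameter offset; for callable functions it is instead a
// preloaded argument register.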
2230 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2231                                                  MachineRegisterInfo &MRI,
2232                                                  MachineIRBuilder &B) const {
2233   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2234   if (!MFI->isEntryFunction()) {
2235     return legalizePreloadedArgIntrin(MI, MRI, B,
2236                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2237   }
2238 
2239   B.setInstr(MI);
2240 
2241   uint64_t Offset =
2242     ST.getTargetLowering()->getImplicitParameterOffset(
2243       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2244   Register DstReg = MI.getOperand(0).getReg();
2245   LLT DstTy = MRI.getType(DstReg);
2246   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2247 
2248   const ArgDescriptor *Arg;
2249   const TargetRegisterClass *RC;
2250   std::tie(Arg, RC)
2251     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2252   if (!Arg)
2253     return false;
2254 
2255   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2256   if (!loadInputValue(KernargPtrReg, B, Arg))
2257     return false;
2258 
2259   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2260   MI.eraseFromParent();
2261   return true;
2262 }
2263 
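// amdgcn_is_shared / amdgcn_is_private: a flat pointer lies in the queried
// segment when the high 32 bits of the pointer equal that segment's aperture
// base.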
2264 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2265                                               MachineRegisterInfo &MRI,
2266                                               MachineIRBuilder &B,
2267                                               unsigned AddrSpace) const {
2268   B.setInstr(MI);
2269   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2270   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2271   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2272   MI.eraseFromParent();
2273   return true;
2274 }
2275 
2276 /// Handle register layout difference for f16 images for some subtargets.
2277 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2278                                              MachineRegisterInfo &MRI,
2279                                              Register Reg) const {
2280   if (!ST.hasUnpackedD16VMem())
2281     return Reg;
2282 
2283   const LLT S16 = LLT::scalar(16);
2284   const LLT S32 = LLT::scalar(32);
2285   LLT StoreVT = MRI.getType(Reg);
2286   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2287 
2288   auto Unmerge = B.buildUnmerge(S16, Reg);
2289 
2290   SmallVector<Register, 4> WideRegs;
2291   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2292     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2293 
2294   int NumElts = StoreVT.getNumElements();
2295 
2296   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2297 }
2298 
2299 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2300                                                  MachineRegisterInfo &MRI,
2301                                                  MachineIRBuilder &B,
2302                                                  bool IsFormat) const {
2303   // TODO: Reject f16 format on targets where unsupported.
2304   Register VData = MI.getOperand(1).getReg();
2305   LLT Ty = MRI.getType(VData);
2306 
2307   B.setInstr(MI);
2308 
2309   const LLT S32 = LLT::scalar(32);
2310   const LLT S16 = LLT::scalar(16);
2311 
2312   // Fixup illegal register types for i8 stores.
2313   if (Ty == LLT::scalar(8) || Ty == S16) {
2314     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2315     MI.getOperand(1).setReg(AnyExt);
2316     return true;
2317   }
2318 
2319   if (Ty.isVector()) {
2320     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2321       if (IsFormat)
2322         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2323       return true;
2324     }
2325 
2326     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2327   }
2328 
2329   return Ty == S32;
2330 }
2331 
2332 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2333                                             MachineRegisterInfo &MRI,
2334                                             MachineIRBuilder &B) const {
  // For the control-flow intrinsics, replace the single G_BRCOND use of the
  // result with the exec-manipulating branch pseudos.
2336   auto IntrID = MI.getIntrinsicID();
2337   switch (IntrID) {
2338   case Intrinsic::amdgcn_if:
2339   case Intrinsic::amdgcn_else: {
2340     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2341       const SIRegisterInfo *TRI
2342         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2343 
2344       B.setInstr(*BrCond);
2345       Register Def = MI.getOperand(1).getReg();
2346       Register Use = MI.getOperand(3).getReg();
2347 
2348       if (IntrID == Intrinsic::amdgcn_if) {
2349         B.buildInstr(AMDGPU::SI_IF)
2350           .addDef(Def)
2351           .addUse(Use)
2352           .addMBB(BrCond->getOperand(1).getMBB());
2353       } else {
2354         B.buildInstr(AMDGPU::SI_ELSE)
2355           .addDef(Def)
2356           .addUse(Use)
2357           .addMBB(BrCond->getOperand(1).getMBB())
2358           .addImm(0);
2359       }
2360 
2361       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2362       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2363       MI.eraseFromParent();
2364       BrCond->eraseFromParent();
2365       return true;
2366     }
2367 
2368     return false;
2369   }
2370   case Intrinsic::amdgcn_loop: {
2371     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2372       const SIRegisterInfo *TRI
2373         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2374 
2375       B.setInstr(*BrCond);
2376       Register Reg = MI.getOperand(2).getReg();
2377       B.buildInstr(AMDGPU::SI_LOOP)
2378         .addUse(Reg)
2379         .addMBB(BrCond->getOperand(1).getMBB());
2380       MI.eraseFromParent();
2381       BrCond->eraseFromParent();
2382       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2383       return true;
2384     }
2385 
2386     return false;
2387   }
2388   case Intrinsic::amdgcn_kernarg_segment_ptr:
2389     return legalizePreloadedArgIntrin(
2390       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2391   case Intrinsic::amdgcn_implicitarg_ptr:
2392     return legalizeImplicitArgPtr(MI, MRI, B);
2393   case Intrinsic::amdgcn_workitem_id_x:
2394     return legalizePreloadedArgIntrin(MI, MRI, B,
2395                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2396   case Intrinsic::amdgcn_workitem_id_y:
2397     return legalizePreloadedArgIntrin(MI, MRI, B,
2398                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2399   case Intrinsic::amdgcn_workitem_id_z:
2400     return legalizePreloadedArgIntrin(MI, MRI, B,
2401                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2402   case Intrinsic::amdgcn_workgroup_id_x:
2403     return legalizePreloadedArgIntrin(MI, MRI, B,
2404                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2405   case Intrinsic::amdgcn_workgroup_id_y:
2406     return legalizePreloadedArgIntrin(MI, MRI, B,
2407                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2408   case Intrinsic::amdgcn_workgroup_id_z:
2409     return legalizePreloadedArgIntrin(MI, MRI, B,
2410                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2411   case Intrinsic::amdgcn_dispatch_ptr:
2412     return legalizePreloadedArgIntrin(MI, MRI, B,
2413                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2414   case Intrinsic::amdgcn_queue_ptr:
2415     return legalizePreloadedArgIntrin(MI, MRI, B,
2416                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2417   case Intrinsic::amdgcn_implicit_buffer_ptr:
2418     return legalizePreloadedArgIntrin(
2419       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2420   case Intrinsic::amdgcn_dispatch_id:
2421     return legalizePreloadedArgIntrin(MI, MRI, B,
2422                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2423   case Intrinsic::amdgcn_fdiv_fast:
2424     return legalizeFDIVFastIntrin(MI, MRI, B);
2425   case Intrinsic::amdgcn_is_shared:
2426     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2427   case Intrinsic::amdgcn_is_private:
2428     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2429   case Intrinsic::amdgcn_wavefrontsize: {
2430     B.setInstr(MI);
2431     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2432     MI.eraseFromParent();
2433     return true;
2434   }
2435   case Intrinsic::amdgcn_raw_buffer_store:
2436     return legalizeRawBufferStore(MI, MRI, B, false);
2437   case Intrinsic::amdgcn_raw_buffer_store_format:
2438     return legalizeRawBufferStore(MI, MRI, B, true);
2439   default:
2440     return true;
2441   }
2442 
2443   return true;
2444 }
2445