1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
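// Match small odd-length vectors: sub-32-bit elements, an odd number of them,
// and a total size that is not a multiple of 32 bits.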
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
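// Match vectors of 16-bit elements with more than two elements.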
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
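// Mutation that widens the vector at TypeIdx by a single element of the same
// element type.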
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
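// Mutation that splits the wide vector at TypeIdx into roughly 64-bit pieces,
// returning the type of a single piece.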
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
// Any vector of 32-, 64-, 128- or 256-bit elements, any even-length vector of
// 16-bit elements (multiples of v2s16), or any scalar that is a multiple of 32
// bits up to 1024 bits.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
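// Match truncating scalar stores where the stored value is wider than 32 bits
// and wider than the memory size.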
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal); // VCC branches
248   setAction({G_BRCOND, S32}, Legal); // SCC branches
249 
250   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
251   // elements for v3s16
252   getActionDefinitionsBuilder(G_PHI)
253     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
254     .legalFor(AllS32Vectors)
255     .legalFor(AllS64Vectors)
256     .legalFor(AddrSpaces64)
257     .legalFor(AddrSpaces32)
258     .clampScalar(0, S32, S256)
259     .widenScalarToNextPow2(0, 32)
260     .clampMaxNumElements(0, S32, 16)
261     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
262     .legalIf(isPointer(0));
263 
264   if (ST.has16BitInsts()) {
265     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
266       .legalFor({S32, S16})
267       .clampScalar(0, S16, S32)
268       .scalarize(0);
269   } else {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32})
272       .clampScalar(0, S32, S32)
273       .scalarize(0);
274   }
275 
276   // FIXME: Not really legal. Placeholder for custom lowering.
277   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278     .legalFor({S32, S64})
279     .clampScalar(0, S32, S64)
280     .widenScalarToNextPow2(0, 32)
281     .scalarize(0);
282 
283   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
284     .legalFor({S32})
285     .clampScalar(0, S32, S32)
286     .scalarize(0);
287 
288   // Report legal for any types we can handle anywhere. For the cases only legal
289   // on the SALU, RegBankSelect will be able to re-legalize.
290   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
291     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
292     .clampScalar(0, S32, S64)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
295     .widenScalarToNextPow2(0)
296     .scalarize(0);
297 
298   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
299                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300     .legalFor({{S32, S1}, {S32, S32}})
301     .clampScalar(0, S32, S32)
302     .scalarize(0); // TODO: Implement.
303 
304   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
305     .lower();
306 
307   getActionDefinitionsBuilder(G_BITCAST)
308     // Don't worry about the size constraint.
309     .legalIf(all(isRegisterType(0), isRegisterType(1)))
310     // FIXME: Testing hack
311     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
312 
313   getActionDefinitionsBuilder(G_FCONSTANT)
314     .legalFor({S32, S64, S16})
315     .clampScalar(0, S16, S64);
316 
317   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
318     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
319                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
320     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
321     .clampScalarOrElt(0, S32, S1024)
322     .legalIf(isMultiple32(0))
323     .widenScalarToNextPow2(0, 32)
324     .clampMaxNumElements(0, S32, 16);
325 
326 
327   // FIXME: i1 operands to intrinsics should always be legal, but other i1
328   // values may not be legal.  We need to figure out how to distinguish
329   // between these two scenarios.
330   getActionDefinitionsBuilder(G_CONSTANT)
331     .legalFor({S1, S32, S64, S16, GlobalPtr,
332                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333     .clampScalar(0, S32, S64)
334     .widenScalarToNextPow2(0)
335     .legalIf(isPointer(0));
336 
337   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340 
341 
342   auto &FPOpActions = getActionDefinitionsBuilder(
343     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344     .legalFor({S32, S64});
345   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346     .customFor({S32, S64});
347   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348     .customFor({S32, S64});
349 
350   if (ST.has16BitInsts()) {
351     if (ST.hasVOP3PInsts())
352       FPOpActions.legalFor({S16, V2S16});
353     else
354       FPOpActions.legalFor({S16});
355 
356     TrigActions.customFor({S16});
357     FDIVActions.customFor({S16});
358   }
359 
360   auto &MinNumMaxNum = getActionDefinitionsBuilder({
361       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362 
363   if (ST.hasVOP3PInsts()) {
364     MinNumMaxNum.customFor(FPTypesPK16)
365       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366       .clampMaxNumElements(0, S16, 2)
367       .clampScalar(0, S16, S64)
368       .scalarize(0);
369   } else if (ST.has16BitInsts()) {
370     MinNumMaxNum.customFor(FPTypes16)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else {
374     MinNumMaxNum.customFor(FPTypesBase)
375       .clampScalar(0, S32, S64)
376       .scalarize(0);
377   }
378 
379   if (ST.hasVOP3PInsts())
380     FPOpActions.clampMaxNumElements(0, S16, 2);
381 
382   FPOpActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   TrigActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   FDIVActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   getActionDefinitionsBuilder({G_FNEG, G_FABS})
395     .legalFor(FPTypesPK16)
396     .clampMaxNumElements(0, S16, 2)
397     .scalarize(0)
398     .clampScalar(0, S16, S64);
399 
400   // TODO: Implement
401   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
402 
403   if (ST.has16BitInsts()) {
404     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
405       .legalFor({S32, S64, S16})
406       .scalarize(0)
407       .clampScalar(0, S16, S64);
408   } else {
409     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
410       .legalFor({S32, S64})
411       .scalarize(0)
412       .clampScalar(0, S32, S64);
413   }
414 
415   getActionDefinitionsBuilder(G_FPTRUNC)
416     .legalFor({{S32, S64}, {S16, S32}})
417     .scalarize(0);
418 
419   getActionDefinitionsBuilder(G_FPEXT)
420     .legalFor({{S64, S32}, {S32, S16}})
421     .lowerFor({{S64, S16}}) // FIXME: Implement
422     .scalarize(0);
423 
424   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
425   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
426 
427   getActionDefinitionsBuilder(G_FSUB)
428       // Use actual fsub instruction
429       .legalFor({S32})
430       // Must use fadd + fneg
431       .lowerFor({S64, S16, V2S16})
432       .scalarize(0)
433       .clampScalar(0, S32, S64);
434 
435   // Whether this is legal depends on the floating point mode for the function.
436   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
437   if (ST.hasMadF16())
438     FMad.customFor({S32, S16});
439   else
440     FMad.customFor({S32});
441   FMad.scalarize(0)
442       .lower();
443 
444   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
445     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
446                {S32, S1}, {S64, S1}, {S16, S1},
447                {S96, S32},
448                // FIXME: Hack
449                {S64, LLT::scalar(33)},
450                {S32, S8}, {S32, LLT::scalar(24)}})
451     .scalarize(0)
452     .clampScalar(0, S32, S64);
453 
454   // TODO: Split s1->s64 during regbankselect for VALU.
455   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
456     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
457     .lowerFor({{S32, S64}})
458     .lowerIf(typeIs(1, S1))
459     .customFor({{S64, S64}});
460   if (ST.has16BitInsts())
461     IToFP.legalFor({{S16, S16}});
462   IToFP.clampScalar(1, S32, S64)
463        .scalarize(0);
464 
465   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
466     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
467   if (ST.has16BitInsts())
468     FPToI.legalFor({{S16, S16}});
469   else
470     FPToI.minScalar(1, S32);
471 
472   FPToI.minScalar(0, S32)
473        .scalarize(0);
474 
475   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
476     .scalarize(0)
477     .lower();
478 
479   if (ST.has16BitInsts()) {
480     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
481       .legalFor({S16, S32, S64})
482       .clampScalar(0, S16, S64)
483       .scalarize(0);
484   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
485     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
486       .legalFor({S32, S64})
487       .clampScalar(0, S32, S64)
488       .scalarize(0);
489   } else {
490     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
491       .legalFor({S32})
492       .customFor({S64})
493       .clampScalar(0, S32, S64)
494       .scalarize(0);
495   }
496 
497   getActionDefinitionsBuilder(G_PTR_ADD)
498     .legalForCartesianProduct(AddrSpaces64, {S64})
499     .legalForCartesianProduct(AddrSpaces32, {S32})
500     .scalarize(0);
501 
502   getActionDefinitionsBuilder(G_PTR_MASK)
503     .scalarize(0)
504     .alwaysLegal();
505 
506   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
507 
508   auto &CmpBuilder =
509     getActionDefinitionsBuilder(G_ICMP)
510     // The compare output type differs based on the register bank of the output,
511     // so make both s1 and s32 legal.
512     //
513     // Scalar compares producing output in scc will be promoted to s32, as that
514     // is the allocatable register type that will be needed for the copy from
515     // scc. This will be promoted during RegBankSelect, and we assume something
516     // before that won't try to use s32 result types.
517     //
518     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
519     // bank.
520     .legalForCartesianProduct(
521       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
522     .legalForCartesianProduct(
523       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
524   if (ST.has16BitInsts()) {
525     CmpBuilder.legalFor({{S1, S16}});
526   }
527 
528   CmpBuilder
529     .widenScalarToNextPow2(1)
530     .clampScalar(1, S32, S64)
531     .scalarize(0)
532     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
533 
534   getActionDefinitionsBuilder(G_FCMP)
535     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
536     .widenScalarToNextPow2(1)
537     .clampScalar(1, S32, S64)
538     .scalarize(0);
539 
540   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
541   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
542                                G_FLOG, G_FLOG2, G_FLOG10})
543     .legalFor({S32})
544     .scalarize(0);
545 
546   // The 64-bit versions produce 32-bit results, but only on the SALU.
547   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
548                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
549                                G_CTPOP})
550     .legalFor({{S32, S32}, {S32, S64}})
551     .clampScalar(0, S32, S32)
552     .clampScalar(1, S32, S64)
553     .scalarize(0)
554     .widenScalarToNextPow2(0, 32)
555     .widenScalarToNextPow2(1, 32);
556 
557   // TODO: Expand for > s32
558   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
559     .legalFor({S32})
560     .clampScalar(0, S32, S32)
561     .scalarize(0);
562 
563   if (ST.has16BitInsts()) {
564     if (ST.hasVOP3PInsts()) {
565       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
566         .legalFor({S32, S16, V2S16})
567         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
568         .clampMaxNumElements(0, S16, 2)
569         .clampScalar(0, S16, S32)
570         .widenScalarToNextPow2(0)
571         .scalarize(0);
572     } else {
573       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
574         .legalFor({S32, S16})
575         .widenScalarToNextPow2(0)
576         .clampScalar(0, S16, S32)
577         .scalarize(0);
578     }
579   } else {
580     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
581       .legalFor({S32})
582       .clampScalar(0, S32, S32)
583       .widenScalarToNextPow2(0)
584       .scalarize(0);
585   }
586 
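  // Size comparison helpers between two type indices, used below to widen or
  // narrow the integer half of G_INTTOPTR / G_PTRTOINT to the pointer size.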
587   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
588     return [=](const LegalityQuery &Query) {
589       return Query.Types[TypeIdx0].getSizeInBits() <
590              Query.Types[TypeIdx1].getSizeInBits();
591     };
592   };
593 
594   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
595     return [=](const LegalityQuery &Query) {
596       return Query.Types[TypeIdx0].getSizeInBits() >
597              Query.Types[TypeIdx1].getSizeInBits();
598     };
599   };
600 
601   getActionDefinitionsBuilder(G_INTTOPTR)
602     // List the common cases
603     .legalForCartesianProduct(AddrSpaces64, {S64})
604     .legalForCartesianProduct(AddrSpaces32, {S32})
605     .scalarize(0)
606     // Accept any address space as long as the size matches
607     .legalIf(sameSize(0, 1))
608     .widenScalarIf(smallerThan(1, 0),
609       [](const LegalityQuery &Query) {
610         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
611       })
612     .narrowScalarIf(greaterThan(1, 0),
613       [](const LegalityQuery &Query) {
614         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
615       });
616 
617   getActionDefinitionsBuilder(G_PTRTOINT)
618     // List the common cases
619     .legalForCartesianProduct(AddrSpaces64, {S64})
620     .legalForCartesianProduct(AddrSpaces32, {S32})
621     .scalarize(0)
622     // Accept any address space as long as the size matches
623     .legalIf(sameSize(0, 1))
624     .widenScalarIf(smallerThan(0, 1),
625       [](const LegalityQuery &Query) {
626         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
627       })
628     .narrowScalarIf(
629       greaterThan(0, 1),
630       [](const LegalityQuery &Query) {
631         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
632       });
633 
634   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
635     .scalarize(0)
636     .custom();
637 
638   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
639   // handle some operations by just promoting the register during
640   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
641   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
642     switch (AS) {
643     // FIXME: Private element size.
644     case AMDGPUAS::PRIVATE_ADDRESS:
645       return 32;
646     // FIXME: Check subtarget
647     case AMDGPUAS::LOCAL_ADDRESS:
648       return ST.useDS128() ? 128 : 64;
649 
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and whether the memory is invariant or not
    // written in a kernel.
656     case AMDGPUAS::CONSTANT_ADDRESS:
657     case AMDGPUAS::GLOBAL_ADDRESS:
658       return 512;
659     default:
660       return 128;
661     }
662   };
663 
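  // Return true if a load/store with this memory size and alignment has to be
  // split: vector extloads, accesses wider than the address space allows,
  // dwordx3 accesses on subtargets without them, and misaligned accesses the
  // target cannot perform.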
664   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
665     const LLT DstTy = Query.Types[0];
666 
667     // Split vector extloads.
668     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
669     unsigned Align = Query.MMODescrs[0].AlignInBits;
670 
671     if (MemSize < DstTy.getSizeInBits())
672       MemSize = std::max(MemSize, Align);
673 
674     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
675       return true;
676 
677     const LLT PtrTy = Query.Types[1];
678     unsigned AS = PtrTy.getAddressSpace();
679     if (MemSize > maxSizeForAddrSpace(AS))
680       return true;
681 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
683     // TODO: May be able to widen depending on alignment etc.
684     unsigned NumRegs = MemSize / 32;
685     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
686       return true;
687 
688     if (Align < MemSize) {
689       const SITargetLowering *TLI = ST.getTargetLowering();
690       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
691     }
692 
693     return false;
694   };
695 
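  // Minimum alignment (in bits) required for global/flat accesses in the
  // lists below; 0 means no restriction when the subtarget supports unaligned
  // buffer access.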
696   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
697   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
698   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
699 
700   // TODO: Refine based on subtargets which support unaligned access or 128-bit
701   // LDS
702   // TODO: Unsupported flat for SI.
703 
704   for (unsigned Op : {G_LOAD, G_STORE}) {
705     const bool IsStore = Op == G_STORE;
706 
707     auto &Actions = getActionDefinitionsBuilder(Op);
708     // Whitelist the common cases.
709     // TODO: Pointer loads
710     // TODO: Wide constant loads
711     // TODO: Only CI+ has 3x loads
712     // TODO: Loads to s16 on gfx9
713     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
714                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
715                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
716                                       {S96, GlobalPtr, 96, GlobalAlign32},
717                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
718                                       {S128, GlobalPtr, 128, GlobalAlign32},
719                                       {S64, GlobalPtr, 64, GlobalAlign32},
720                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
721                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
722                                       {S32, GlobalPtr, 8, GlobalAlign8},
723                                       {S32, GlobalPtr, 16, GlobalAlign16},
724 
725                                       {S32, LocalPtr, 32, 32},
726                                       {S64, LocalPtr, 64, 32},
727                                       {V2S32, LocalPtr, 64, 32},
728                                       {S32, LocalPtr, 8, 8},
729                                       {S32, LocalPtr, 16, 16},
730                                       {V2S16, LocalPtr, 32, 32},
731 
732                                       {S32, PrivatePtr, 32, 32},
733                                       {S32, PrivatePtr, 8, 8},
734                                       {S32, PrivatePtr, 16, 16},
735                                       {V2S16, PrivatePtr, 32, 32},
736 
737                                       {S32, FlatPtr, 32, GlobalAlign32},
738                                       {S32, FlatPtr, 16, GlobalAlign16},
739                                       {S32, FlatPtr, 8, GlobalAlign8},
740                                       {V2S16, FlatPtr, 32, GlobalAlign32},
741 
742                                       {S32, ConstantPtr, 32, GlobalAlign32},
743                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
744                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
745                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
746                                       {S64, ConstantPtr, 64, GlobalAlign32},
747                                       {S128, ConstantPtr, 128, GlobalAlign32},
748                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
749     Actions
750         .customIf(typeIs(1, Constant32Ptr))
751         .narrowScalarIf(
752             [=](const LegalityQuery &Query) -> bool {
753               return !Query.Types[0].isVector() && needToSplitLoad(Query);
754             },
755             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
756               const LLT DstTy = Query.Types[0];
757               const LLT PtrTy = Query.Types[1];
758 
759               const unsigned DstSize = DstTy.getSizeInBits();
760               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
761 
762               // Split extloads.
763               if (DstSize > MemSize)
764                 return std::make_pair(0, LLT::scalar(MemSize));
765 
766               if (DstSize > 32 && (DstSize % 32 != 0)) {
767                 // FIXME: Need a way to specify non-extload of larger size if
768                 // suitably aligned.
769                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
770               }
771 
772               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
773               if (MemSize > MaxSize)
774                 return std::make_pair(0, LLT::scalar(MaxSize));
775 
776               unsigned Align = Query.MMODescrs[0].AlignInBits;
777               return std::make_pair(0, LLT::scalar(Align));
778             })
779         .fewerElementsIf(
780             [=](const LegalityQuery &Query) -> bool {
781               return Query.Types[0].isVector() && needToSplitLoad(Query);
782             },
783             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
784               const LLT DstTy = Query.Types[0];
785               const LLT PtrTy = Query.Types[1];
786 
787               LLT EltTy = DstTy.getElementType();
788               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
789 
790               // Split if it's too large for the address space.
791               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
792                 unsigned NumElts = DstTy.getNumElements();
793                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
794 
795                 // FIXME: Refine when odd breakdowns handled
796                 // The scalars will need to be re-legalized.
797                 if (NumPieces == 1 || NumPieces >= NumElts ||
798                     NumElts % NumPieces != 0)
799                   return std::make_pair(0, EltTy);
800 
801                 return std::make_pair(0,
802                                       LLT::vector(NumElts / NumPieces, EltTy));
803               }
804 
805               // Need to split because of alignment.
806               unsigned Align = Query.MMODescrs[0].AlignInBits;
807               unsigned EltSize = EltTy.getSizeInBits();
808               if (EltSize > Align &&
809                   (EltSize / Align < DstTy.getNumElements())) {
810                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
811               }
812 
813               // May need relegalization for the scalars.
814               return std::make_pair(0, EltTy);
815             })
816         .minScalar(0, S32);
817 
818     if (IsStore)
819       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
820 
821     // TODO: Need a bitcast lower option?
822     Actions
823         .legalIf([=](const LegalityQuery &Query) {
824           const LLT Ty0 = Query.Types[0];
825           unsigned Size = Ty0.getSizeInBits();
826           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
827           unsigned Align = Query.MMODescrs[0].AlignInBits;
828 
829           // FIXME: Widening store from alignment not valid.
830           if (MemSize < Size)
831             MemSize = std::max(MemSize, Align);
832 
833           // No extending vector loads.
834           if (Size > MemSize && Ty0.isVector())
835             return false;
836 
837           switch (MemSize) {
838           case 8:
839           case 16:
840             return Size == 32;
841           case 32:
842           case 64:
843           case 128:
844             return true;
845           case 96:
846             return ST.hasDwordx3LoadStores();
847           case 256:
848           case 512:
849             return true;
850           default:
851             return false;
852           }
853         })
854         .widenScalarToNextPow2(0)
855         // TODO: v3s32->v4s32 with alignment
856         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
857   }
858 
859   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
860                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
861                                                   {S32, GlobalPtr, 16, 2 * 8},
862                                                   {S32, LocalPtr, 8, 8},
863                                                   {S32, LocalPtr, 16, 16},
864                                                   {S32, PrivatePtr, 8, 8},
865                                                   {S32, PrivatePtr, 16, 16},
866                                                   {S32, ConstantPtr, 8, 8},
867                                                   {S32, ConstantPtr, 16, 2 * 8}});
868   if (ST.hasFlatAddressSpace()) {
869     ExtLoads.legalForTypesWithMemDesc(
870         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
871   }
872 
873   ExtLoads.clampScalar(0, S32, S32)
874           .widenScalarToNextPow2(0)
875           .unsupportedIfMemSizeNotPow2()
876           .lower();
877 
878   auto &Atomics = getActionDefinitionsBuilder(
879     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
880      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
881      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
882      G_ATOMICRMW_UMIN})
883     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
884                {S64, GlobalPtr}, {S64, LocalPtr}});
885   if (ST.hasFlatAddressSpace()) {
886     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
887   }
888 
889   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
890     .legalFor({{S32, LocalPtr}});
891 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
894   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
895     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
896                 {S32, FlatPtr}, {S64, FlatPtr}})
897     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
898                {S32, RegionPtr}, {S64, RegionPtr}});
899 
900   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
901     .lower();
902 
903   // TODO: Pointer types, any 32-bit or 64-bit vector
904 
905   // Condition should be s32 for scalar, s1 for vector.
906   getActionDefinitionsBuilder(G_SELECT)
907     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
908           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
909           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
910     .clampScalar(0, S16, S64)
911     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
912     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
913     .scalarize(1)
914     .clampMaxNumElements(0, S32, 2)
915     .clampMaxNumElements(0, LocalPtr, 2)
916     .clampMaxNumElements(0, PrivatePtr, 2)
917     .scalarize(0)
918     .widenScalarToNextPow2(0)
919     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
920 
921   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
922   // be more flexible with the shift amount type.
923   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
924     .legalFor({{S32, S32}, {S64, S32}});
925   if (ST.has16BitInsts()) {
926     if (ST.hasVOP3PInsts()) {
927       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
928             .clampMaxNumElements(0, S16, 2);
929     } else
930       Shifts.legalFor({{S16, S32}, {S16, S16}});
931 
932     // TODO: Support 16-bit shift amounts
933     Shifts.clampScalar(1, S32, S32);
934     Shifts.clampScalar(0, S16, S64);
935     Shifts.widenScalarToNextPow2(0, 16);
936   } else {
937     // Make sure we legalize the shift amount type first, as the general
938     // expansion for the shifted type will produce much worse code if it hasn't
939     // been truncated already.
940     Shifts.clampScalar(1, S32, S32);
941     Shifts.clampScalar(0, S32, S64);
942     Shifts.widenScalarToNextPow2(0, 32);
943   }
944   Shifts.scalarize(0);
945 
946   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
947     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
948     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
949     unsigned IdxTypeIdx = 2;
950 
951     getActionDefinitionsBuilder(Op)
952       .customIf([=](const LegalityQuery &Query) {
953           const LLT EltTy = Query.Types[EltTypeIdx];
954           const LLT VecTy = Query.Types[VecTypeIdx];
955           const LLT IdxTy = Query.Types[IdxTypeIdx];
956           return (EltTy.getSizeInBits() == 16 ||
957                   EltTy.getSizeInBits() % 32 == 0) &&
958                  VecTy.getSizeInBits() % 32 == 0 &&
959                  VecTy.getSizeInBits() <= 1024 &&
960                  IdxTy.getSizeInBits() == 32;
961         })
962       .clampScalar(EltTypeIdx, S32, S64)
963       .clampScalar(VecTypeIdx, S32, S64)
964       .clampScalar(IdxTypeIdx, S32, S32);
965   }
966 
967   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
968     .unsupportedIf([=](const LegalityQuery &Query) {
969         const LLT &EltTy = Query.Types[1].getElementType();
970         return Query.Types[0] != EltTy;
971       });
972 
973   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
974     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
975     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
976 
977     // FIXME: Doesn't handle extract of illegal sizes.
978     getActionDefinitionsBuilder(Op)
979       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
980       // FIXME: Multiples of 16 should not be legal.
981       .legalIf([=](const LegalityQuery &Query) {
982           const LLT BigTy = Query.Types[BigTyIdx];
983           const LLT LitTy = Query.Types[LitTyIdx];
984           return (BigTy.getSizeInBits() % 32 == 0) &&
985                  (LitTy.getSizeInBits() % 16 == 0);
986         })
987       .widenScalarIf(
988         [=](const LegalityQuery &Query) {
989           const LLT BigTy = Query.Types[BigTyIdx];
990           return (BigTy.getScalarSizeInBits() < 16);
991         },
992         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
993       .widenScalarIf(
994         [=](const LegalityQuery &Query) {
995           const LLT LitTy = Query.Types[LitTyIdx];
996           return (LitTy.getScalarSizeInBits() < 16);
997         },
998         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
999       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1000       .widenScalarToNextPow2(BigTyIdx, 32);
1001 
1002   }
1003 
1004   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1005     .legalForCartesianProduct(AllS32Vectors, {S32})
1006     .legalForCartesianProduct(AllS64Vectors, {S64})
1007     .clampNumElements(0, V16S32, V32S32)
1008     .clampNumElements(0, V2S64, V16S64)
1009     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1010 
1011   if (ST.hasScalarPackInsts())
1012     BuildVector.legalFor({V2S16, S32});
1013 
1014   BuildVector
1015     .minScalarSameAs(1, 0)
1016     .legalIf(isRegisterType(0))
1017     .minScalarOrElt(0, S32);
1018 
1019   if (ST.hasScalarPackInsts()) {
1020     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1021       .legalFor({V2S16, S32})
1022       .lower();
1023   } else {
1024     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1025       .lower();
1026   }
1027 
1028   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1029     .legalIf(isRegisterType(0));
1030 
1031   // TODO: Don't fully scalarize v2s16 pieces
1032   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1033 
1034   // Merge/Unmerge
1035   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1036     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1037     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1038 
1039     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1040       const LLT &Ty = Query.Types[TypeIdx];
1041       if (Ty.isVector()) {
1042         const LLT &EltTy = Ty.getElementType();
1043         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1044           return true;
1045         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1046           return true;
1047       }
1048       return false;
1049     };
1050 
1051     auto &Builder = getActionDefinitionsBuilder(Op)
1052       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1056       .clampScalar(LitTyIdx, S16, S256)
1057       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1058       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1059       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1060                            elementTypeIs(1, S16)),
1061                        changeTo(1, V2S16))
1062       // Break up vectors with weird elements into scalars
1063       .fewerElementsIf(
1064         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1065         scalarize(0))
1066       .fewerElementsIf(
1067         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1068         scalarize(1))
1069       .clampScalar(BigTyIdx, S32, S1024)
1070       .lowerFor({{S16, V2S16}});
1071 
1072     if (Op == G_MERGE_VALUES) {
1073       Builder.widenScalarIf(
1074         // TODO: Use 16-bit shifts if legal for 8-bit values?
1075         [=](const LegalityQuery &Query) {
1076           const LLT Ty = Query.Types[LitTyIdx];
1077           return Ty.getSizeInBits() < 32;
1078         },
1079         changeTo(LitTyIdx, S32));
1080     }
1081 
1082     Builder.widenScalarIf(
1083       [=](const LegalityQuery &Query) {
1084         const LLT Ty = Query.Types[BigTyIdx];
1085         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1086           Ty.getSizeInBits() % 16 != 0;
1087       },
1088       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1091         const LLT &Ty = Query.Types[BigTyIdx];
1092         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1093         if (NewSizeInBits >= 256) {
1094           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1095           if (RoundedTo < NewSizeInBits)
1096             NewSizeInBits = RoundedTo;
1097         }
1098         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1099       })
1100       .legalIf([=](const LegalityQuery &Query) {
1101           const LLT &BigTy = Query.Types[BigTyIdx];
1102           const LLT &LitTy = Query.Types[LitTyIdx];
1103 
1104           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1105             return false;
1106           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1107             return false;
1108 
1109           return BigTy.getSizeInBits() % 16 == 0 &&
1110                  LitTy.getSizeInBits() % 16 == 0 &&
1111                  BigTy.getSizeInBits() <= 1024;
1112         })
1113       // Any vectors left are the wrong size. Scalarize them.
1114       .scalarize(0)
1115       .scalarize(1);
1116   }
1117 
1118   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1119 
1120   getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1121 
1122   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1123     .legalFor({S64});
1124 
1125   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1126         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1127         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1128     .unsupported();
1129 
1130   computeTables();
1131   verify(*ST.getInstrInfo());
1132 }
1133 
1134 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1135                                          MachineRegisterInfo &MRI,
1136                                          MachineIRBuilder &B,
1137                                          GISelChangeObserver &Observer) const {
1138   switch (MI.getOpcode()) {
1139   case TargetOpcode::G_ADDRSPACE_CAST:
1140     return legalizeAddrSpaceCast(MI, MRI, B);
1141   case TargetOpcode::G_FRINT:
1142     return legalizeFrint(MI, MRI, B);
1143   case TargetOpcode::G_FCEIL:
1144     return legalizeFceil(MI, MRI, B);
1145   case TargetOpcode::G_INTRINSIC_TRUNC:
1146     return legalizeIntrinsicTrunc(MI, MRI, B);
1147   case TargetOpcode::G_SITOFP:
1148     return legalizeITOFP(MI, MRI, B, true);
1149   case TargetOpcode::G_UITOFP:
1150     return legalizeITOFP(MI, MRI, B, false);
1151   case TargetOpcode::G_FMINNUM:
1152   case TargetOpcode::G_FMAXNUM:
1153   case TargetOpcode::G_FMINNUM_IEEE:
1154   case TargetOpcode::G_FMAXNUM_IEEE:
1155     return legalizeMinNumMaxNum(MI, MRI, B);
1156   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1157     return legalizeExtractVectorElt(MI, MRI, B);
1158   case TargetOpcode::G_INSERT_VECTOR_ELT:
1159     return legalizeInsertVectorElt(MI, MRI, B);
1160   case TargetOpcode::G_FSIN:
1161   case TargetOpcode::G_FCOS:
1162     return legalizeSinCos(MI, MRI, B);
1163   case TargetOpcode::G_GLOBAL_VALUE:
1164     return legalizeGlobalValue(MI, MRI, B);
1165   case TargetOpcode::G_LOAD:
1166     return legalizeLoad(MI, MRI, B, Observer);
1167   case TargetOpcode::G_FMAD:
1168     return legalizeFMad(MI, MRI, B);
1169   case TargetOpcode::G_FDIV:
1170     return legalizeFDIV(MI, MRI, B);
1171   case TargetOpcode::G_ATOMIC_CMPXCHG:
1172     return legalizeAtomicCmpXChg(MI, MRI, B);
1173   default:
1174     return false;
1175   }
1176 
1177   llvm_unreachable("expected switch to return");
1178 }
1179 
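// Return a 32-bit register containing the aperture (high half of the 64-bit
// flat address) for the given LOCAL or PRIVATE address space, read either from
// the aperture hardware registers or from the queue pointer.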
1180 Register AMDGPULegalizerInfo::getSegmentAperture(
1181   unsigned AS,
1182   MachineRegisterInfo &MRI,
1183   MachineIRBuilder &B) const {
1184   MachineFunction &MF = B.getMF();
1185   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1186   const LLT S32 = LLT::scalar(32);
1187 
1188   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1189 
1190   if (ST.hasApertureRegs()) {
1191     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1192     // getreg.
1193     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1194         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1195         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1196     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1197         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1198         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1199     unsigned Encoding =
1200         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1201         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1202         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1203 
1204     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1205     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1206 
1207     B.buildInstr(AMDGPU::S_GETREG_B32)
1208       .addDef(GetReg)
1209       .addImm(Encoding);
1210     MRI.setType(GetReg, S32);
1211 
1212     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1213     B.buildInstr(TargetOpcode::G_SHL)
1214       .addDef(ApertureReg)
1215       .addUse(GetReg)
1216       .addUse(ShiftAmt.getReg(0));
1217 
1218     return ApertureReg;
1219   }
1220 
1221   Register QueuePtr = MRI.createGenericVirtualRegister(
1222     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1223 
1224   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1225   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1226     return Register();
1227 
1228   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1229   // private_segment_aperture_base_hi.
1230   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1231 
1232   // TODO: can we be smarter about machine pointer info?
1233   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1234   MachineMemOperand *MMO = MF.getMachineMemOperand(
1235     PtrInfo,
1236     MachineMemOperand::MOLoad |
1237     MachineMemOperand::MODereferenceable |
1238     MachineMemOperand::MOInvariant,
1239     4,
1240     MinAlign(64, StructOffset));
1241 
1242   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1243   Register LoadAddr;
1244 
1245   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1246   B.buildLoad(LoadResult, LoadAddr, *MMO);
1247   return LoadResult;
1248 }
1249 
1250 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1251   MachineInstr &MI, MachineRegisterInfo &MRI,
1252   MachineIRBuilder &B) const {
1253   MachineFunction &MF = B.getMF();
1254 
1255   B.setInstr(MI);
1256 
1257   const LLT S32 = LLT::scalar(32);
1258   Register Dst = MI.getOperand(0).getReg();
1259   Register Src = MI.getOperand(1).getReg();
1260 
1261   LLT DstTy = MRI.getType(Dst);
1262   LLT SrcTy = MRI.getType(Src);
1263   unsigned DestAS = DstTy.getAddressSpace();
1264   unsigned SrcAS = SrcTy.getAddressSpace();
1265 
1266   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1267   // vector element.
1268   assert(!DstTy.isVector());
1269 
1270   const AMDGPUTargetMachine &TM
1271     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1272 
1273   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1274   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1275     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1276     return true;
1277   }
1278 
1279   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1280     // Truncate.
1281     B.buildExtract(Dst, Src, 0);
1282     MI.eraseFromParent();
1283     return true;
1284   }
1285 
1286   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1287     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1288     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1289 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1293     auto HighAddr = B.buildConstant(
1294       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1295     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1296     MI.eraseFromParent();
1297     return true;
1298   }
1299 
1300   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1301     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1302            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1303     unsigned NullVal = TM.getNullPointerValue(DestAS);
1304 
1305     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1306     auto FlatNull = B.buildConstant(SrcTy, 0);
1307 
1308     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1309 
1310     // Extract low 32-bits of the pointer.
1311     B.buildExtract(PtrLo32, Src, 0);
1312 
1313     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1314     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1315     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1316 
1317     MI.eraseFromParent();
1318     return true;
1319   }
1320 
1321   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1322     return false;
1323 
1324   if (!ST.hasFlatAddressSpace())
1325     return false;
1326 
1327   auto SegmentNull =
1328       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1329   auto FlatNull =
1330       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1331 
1332   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1333   if (!ApertureReg.isValid())
1334     return false;
1335 
1336   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1337   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1338 
1339   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1340 
1341   // Coerce the type of the low half of the result so we can use merge_values.
1342   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1343   B.buildInstr(TargetOpcode::G_PTRTOINT)
1344     .addDef(SrcAsInt)
1345     .addUse(Src);
1346 
1347   // TODO: Should we allow mismatched types but matching sizes in merges to
1348   // avoid the ptrtoint?
1349   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1350   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1351 
1352   MI.eraseFromParent();
1353   return true;
1354 }
1355 
1356 bool AMDGPULegalizerInfo::legalizeFrint(
1357   MachineInstr &MI, MachineRegisterInfo &MRI,
1358   MachineIRBuilder &B) const {
1359   B.setInstr(MI);
1360 
1361   Register Src = MI.getOperand(1).getReg();
1362   LLT Ty = MRI.getType(Src);
1363   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1364 
1365   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1366   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1367 
1368   auto C1 = B.buildFConstant(Ty, C1Val);
1369   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1370 
1371   // TODO: Should this propagate fast-math-flags?
1372   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1373   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1374 
1375   auto C2 = B.buildFConstant(Ty, C2Val);
1376   auto Fabs = B.buildFAbs(Ty, Src);
1377 
1378   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1379   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1380   return true;
1381 }
1382 
1383 bool AMDGPULegalizerInfo::legalizeFceil(
1384   MachineInstr &MI, MachineRegisterInfo &MRI,
1385   MachineIRBuilder &B) const {
1386   B.setInstr(MI);
1387 
1388   const LLT S1 = LLT::scalar(1);
1389   const LLT S64 = LLT::scalar(64);
1390 
1391   Register Src = MI.getOperand(1).getReg();
1392   assert(MRI.getType(Src) == S64);
1393 
1394   // result = trunc(src)
1395   // if (src > 0.0 && src != result)
1396   //   result += 1.0
1397 
1398   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1399 
1400   const auto Zero = B.buildFConstant(S64, 0.0);
1401   const auto One = B.buildFConstant(S64, 1.0);
1402   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1403   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1404   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1405   auto Add = B.buildSelect(S64, And, One, Zero);
1406 
1407   // TODO: Should this propagate fast-math-flags?
1408   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1409   return true;
1410 }
1411 
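// Extract the unbiased exponent field from the high 32 bits (Hi) of an f64
// value using the ubfe intrinsic.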
1412 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1413                                               MachineIRBuilder &B) {
1414   const unsigned FractBits = 52;
1415   const unsigned ExpBits = 11;
1416   LLT S32 = LLT::scalar(32);
1417 
1418   auto Const0 = B.buildConstant(S32, FractBits - 32);
1419   auto Const1 = B.buildConstant(S32, ExpBits);
1420 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1424 
1425   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1426 }
1427 
1428 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1429   MachineInstr &MI, MachineRegisterInfo &MRI,
1430   MachineIRBuilder &B) const {
1431   B.setInstr(MI);
1432 
1433   const LLT S1 = LLT::scalar(1);
1434   const LLT S32 = LLT::scalar(32);
1435   const LLT S64 = LLT::scalar(64);
1436 
1437   Register Src = MI.getOperand(1).getReg();
1438   assert(MRI.getType(Src) == S64);
1439 
1440   // TODO: Should this use extract since the low half is unused?
1441   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1442   Register Hi = Unmerge.getReg(1);
1443 
1444   // Extract the upper half, since this is where we will find the sign and
1445   // exponent.
1446   auto Exp = extractF64Exponent(Hi, B);
1447 
1448   const unsigned FractBits = 52;
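  // Rough sketch of the expansion (illustrative only):
  //   exp  = unbiased_exponent(src)
  //   keep = src & ~((2^52 - 1) >> exp)      // clear fraction bits below 1.0
  //   r    = exp < 0  ? copysign(0.0, src)   // |src| < 1.0 truncates to +/-0
  //        : exp > 51 ? src                  // already integral
  //        : keep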
1449 
1450   // Extract the sign bit.
1451   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1452   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1453 
1454   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1455 
1456   const auto Zero32 = B.buildConstant(S32, 0);
1457 
1458   // Extend back to 64-bits.
1459   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1460 
1461   auto Shr = B.buildAShr(S64, FractMask, Exp);
1462   auto Not = B.buildNot(S64, Shr);
1463   auto Tmp0 = B.buildAnd(S64, Src, Not);
1464   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1465 
1466   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1467   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1468 
1469   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1472 }
1473 
1474 bool AMDGPULegalizerInfo::legalizeITOFP(
1475   MachineInstr &MI, MachineRegisterInfo &MRI,
1476   MachineIRBuilder &B, bool Signed) const {
1477   B.setInstr(MI);
1478 
1479   Register Dst = MI.getOperand(0).getReg();
1480   Register Src = MI.getOperand(1).getReg();
1481 
1482   const LLT S64 = LLT::scalar(64);
1483   const LLT S32 = LLT::scalar(32);
1484 
1485   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
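  // Illustrative equivalent: with the 64-bit source split into 32-bit halves
  // {Lo, Hi}, the conversion is
  //   (double)src = ldexp((double)Hi, 32) + (double)Lo
  // where Hi is converted signed or unsigned to match the source type and Lo
  // is always treated as unsigned.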
1486 
1487   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1488 
1489   auto CvtHi = Signed ?
1490     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1491     B.buildUITOFP(S64, Unmerge.getReg(1));
1492 
1493   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1494 
1495   auto ThirtyTwo = B.buildConstant(S32, 32);
1496   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1497     .addUse(CvtHi.getReg(0))
1498     .addUse(ThirtyTwo.getReg(0));
1499 
1500   // TODO: Should this propagate fast-math-flags?
1501   B.buildFAdd(Dst, LdExp, CvtLo);
1502   MI.eraseFromParent();
1503   return true;
1504 }
1505 
1506 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1507   MachineInstr &MI, MachineRegisterInfo &MRI,
1508   MachineIRBuilder &B) const {
1509   MachineFunction &MF = B.getMF();
1510   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1511 
1512   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1513                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1514 
1515   // With ieee_mode disabled, the instructions have the correct behavior
1516   // already for G_FMINNUM/G_FMAXNUM
1517   if (!MFI->getMode().IEEE)
1518     return !IsIEEEOp;
1519 
1520   if (IsIEEEOp)
1521     return true;
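  // In IEEE mode, plain G_FMINNUM/G_FMAXNUM must still quiet signaling NaNs,
  // so defer to the generic lowering (which, roughly, canonicalizes the inputs
  // and uses the IEEE variants).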
1522 
1523   MachineIRBuilder HelperBuilder(MI);
1524   GISelObserverWrapper DummyObserver;
1525   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1526   HelperBuilder.setInstr(MI);
1527   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1528 }
1529 
1530 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1531   MachineInstr &MI, MachineRegisterInfo &MRI,
1532   MachineIRBuilder &B) const {
1533   // TODO: Should move some of this into LegalizerHelper.
1534 
1535   // TODO: Promote dynamic indexing of s16 to s32
1536   // TODO: Dynamic s64 indexing is only legal for SGPR.
1537   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1538   if (!IdxVal) // Dynamic case will be selected to register indexing.
1539     return true;
1540 
1541   Register Dst = MI.getOperand(0).getReg();
1542   Register Vec = MI.getOperand(1).getReg();
1543 
1544   LLT VecTy = MRI.getType(Vec);
1545   LLT EltTy = VecTy.getElementType();
1546   assert(EltTy == MRI.getType(Dst));
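  // With a constant index this becomes a fixed-offset G_EXTRACT; e.g.
  // (illustrative) element 2 of a <4 x s32> vector is read from bit offset 64.
  // Out-of-bounds constant indices fold to undef below.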
1547 
1548   B.setInstr(MI);
1549 
1550   if (IdxVal.getValue() < VecTy.getNumElements())
1551     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1552   else
1553     B.buildUndef(Dst);
1554 
1555   MI.eraseFromParent();
1556   return true;
1557 }
1558 
1559 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1560   MachineInstr &MI, MachineRegisterInfo &MRI,
1561   MachineIRBuilder &B) const {
1562   // TODO: Should move some of this into LegalizerHelper.
1563 
1564   // TODO: Promote dynamic indexing of s16 to s32
1565   // TODO: Dynamic s64 indexing is only legal for SGPR.
1566   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1567   if (!IdxVal) // Dynamic case will be selected to register indexing.
1568     return true;
1569 
1570   Register Dst = MI.getOperand(0).getReg();
1571   Register Vec = MI.getOperand(1).getReg();
1572   Register Ins = MI.getOperand(2).getReg();
1573 
1574   LLT VecTy = MRI.getType(Vec);
1575   LLT EltTy = VecTy.getElementType();
1576   assert(EltTy == MRI.getType(Ins));
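  // As in the extract case above, a constant in-bounds index becomes a
  // fixed-offset G_INSERT at IdxVal * element-size bits; out-of-bounds indices
  // fold to undef.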
1577 
1578   B.setInstr(MI);
1579 
1580   if (IdxVal.getValue() < VecTy.getNumElements())
1581     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1582   else
1583     B.buildUndef(Dst);
1584 
1585   MI.eraseFromParent();
1586   return true;
1587 }
1588 
1589 bool AMDGPULegalizerInfo::legalizeSinCos(
1590   MachineInstr &MI, MachineRegisterInfo &MRI,
1591   MachineIRBuilder &B) const {
1592   B.setInstr(MI);
1593 
1594   Register DstReg = MI.getOperand(0).getReg();
1595   Register SrcReg = MI.getOperand(1).getReg();
1596   LLT Ty = MRI.getType(DstReg);
1597   unsigned Flags = MI.getFlags();
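  // The hardware sin/cos expect an input pre-scaled by 1/(2*pi). Roughly:
  //   sin(x) -> llvm.amdgcn.sin(fract(x * 1/(2*pi)))  // reduced-range subtargets
  //   sin(x) -> llvm.amdgcn.sin(x * 1/(2*pi))         // otherwise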
1598 
1599   Register TrigVal;
1600   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1601   if (ST.hasTrigReducedRange()) {
1602     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1603     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1604       .addUse(MulVal.getReg(0))
1605       .setMIFlags(Flags).getReg(0);
1606   } else
1607     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1608 
1609   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1610     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1611   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1612     .addUse(TrigVal)
1613     .setMIFlags(Flags);
1614   MI.eraseFromParent();
1615   return true;
1616 }
1617 
1618 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1619   Register DstReg, LLT PtrTy,
1620   MachineIRBuilder &B, const GlobalValue *GV,
1621   unsigned Offset, unsigned GAFlags) const {
1622   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1623   // to the following code sequence:
1624   //
1625   // For constant address space:
1626   //   s_getpc_b64 s[0:1]
1627   //   s_add_u32 s0, s0, $symbol
1628   //   s_addc_u32 s1, s1, 0
1629   //
1630   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1631   //   a fixup or relocation is emitted to replace $symbol with a literal
1632   //   constant, which is a pc-relative offset from the encoding of the $symbol
1633   //   operand to the global variable.
1634   //
1635   // For global address space:
1636   //   s_getpc_b64 s[0:1]
1637   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1638   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1639   //
1640   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1641   //   fixups or relocations are emitted to replace $symbol@*@lo and
1642   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1643   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1644   //   operand to the global variable.
1645   //
1646   // What we want here is an offset from the value returned by s_getpc
1647   // (which is the address of the s_add_u32 instruction) to the global
1648   // variable, but since the encoding of $symbol starts 4 bytes after the start
1649   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1650   // small. This requires us to add 4 to the global variable offset in order to
1651   // compute the correct address.
1652 
1653   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1654 
1655   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1656     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1657 
1658   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1659     .addDef(PCReg);
1660 
1661   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1662   if (GAFlags == SIInstrInfo::MO_NONE)
1663     MIB.addImm(0);
1664   else
1665     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1666 
1667   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1668 
1669   if (PtrTy.getSizeInBits() == 32)
1670     B.buildExtract(DstReg, PCReg, 0);
1671   return true;
1672  }
1673 
1674 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1675   MachineInstr &MI, MachineRegisterInfo &MRI,
1676   MachineIRBuilder &B) const {
1677   Register DstReg = MI.getOperand(0).getReg();
1678   LLT Ty = MRI.getType(DstReg);
1679   unsigned AS = Ty.getAddressSpace();
1680 
1681   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1682   MachineFunction &MF = B.getMF();
1683   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1684   B.setInstr(MI);
1685 
1686   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1687     if (!MFI->isEntryFunction()) {
1688       const Function &Fn = MF.getFunction();
1689       DiagnosticInfoUnsupported BadLDSDecl(
1690         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1691       Fn.getContext().diagnose(BadLDSDecl);
1692     }
1693 
1694     // TODO: We could emit code to handle the initialization somewhere.
1695     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1696       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1697       MI.eraseFromParent();
1698       return true;
1699     }
1700 
1701     const Function &Fn = MF.getFunction();
1702     DiagnosticInfoUnsupported BadInit(
1703       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1704     Fn.getContext().diagnose(BadInit);
1705     return true;
1706   }
1707 
1708   const SITargetLowering *TLI = ST.getTargetLowering();
1709 
1710   if (TLI->shouldEmitFixup(GV)) {
1711     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1712     MI.eraseFromParent();
1713     return true;
1714   }
1715 
1716   if (TLI->shouldEmitPCReloc(GV)) {
1717     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1718     MI.eraseFromParent();
1719     return true;
1720   }
1721 
1722   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1723   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1724 
1725   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1726     MachinePointerInfo::getGOT(MF),
1727     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1728     MachineMemOperand::MOInvariant,
1729     8 /*Size*/, 8 /*Align*/);
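  // The GOT entry holds the 64-bit absolute address of GV, so the address is
  // materialized as a pc-relative reference to the GOT slot followed by an
  // invariant, dereferenceable load from it.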
1730 
1731   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1732 
1733   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1735     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1736     B.buildExtract(DstReg, Load, 0);
1737   } else
1738     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1739 
1740   MI.eraseFromParent();
1741   return true;
1742 }
1743 
1744 bool AMDGPULegalizerInfo::legalizeLoad(
1745   MachineInstr &MI, MachineRegisterInfo &MRI,
1746   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1747   B.setInstr(MI);
1748   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1749   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1750   Observer.changingInstr(MI);
1751   MI.getOperand(1).setReg(Cast.getReg(0));
1752   Observer.changedInstr(MI);
1753   return true;
1754 }
1755 
1756 bool AMDGPULegalizerInfo::legalizeFMad(
1757   MachineInstr &MI, MachineRegisterInfo &MRI,
1758   MachineIRBuilder &B) const {
1759   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1760   assert(Ty.isScalar());
1761 
1762   MachineFunction &MF = B.getMF();
1763   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1764 
1765   // TODO: Always legal with future ftz flag.
1766   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1767     return true;
1768   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

1772   MachineIRBuilder HelperBuilder(MI);
1773   GISelObserverWrapper DummyObserver;
1774   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1775   HelperBuilder.setMBB(*MI.getParent());
1776   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1777 }
1778 
1779 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1780   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1781   Register DstReg = MI.getOperand(0).getReg();
1782   Register PtrReg = MI.getOperand(1).getReg();
1783   Register CmpVal = MI.getOperand(2).getReg();
1784   Register NewVal = MI.getOperand(3).getReg();
1785 
1786   assert(SITargetLowering::isFlatGlobalAddrSpace(
1787            MRI.getType(PtrReg).getAddressSpace()) &&
1788          "this should not have been custom lowered");
1789 
1790   LLT ValTy = MRI.getType(CmpVal);
1791   LLT VecTy = LLT::vector(2, ValTy);
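  // The target pseudo expects the data as a 2-element vector packing
  // {new value, compare value} into a single operand.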
1792 
1793   B.setInstr(MI);
1794   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1795 
1796   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1797     .addDef(DstReg)
1798     .addUse(PtrReg)
1799     .addUse(PackedVal)
1800     .setMemRefs(MI.memoperands());
1801 
1802   MI.eraseFromParent();
1803   return true;
1804 }
1805 
// Return the use branch instruction, or null if the usage is invalid.
1807 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1808                                        MachineRegisterInfo &MRI) {
1809   Register CondDef = MI.getOperand(0).getReg();
1810   if (!MRI.hasOneNonDBGUse(CondDef))
1811     return nullptr;
1812 
1813   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1814   return UseMI.getParent() == MI.getParent() &&
1815     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1816 }
1817 
1818 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1819                                                 Register Reg, LLT Ty) const {
1820   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1821   if (LiveIn)
1822     return LiveIn;
1823 
1824   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1825   MRI.addLiveIn(Reg, NewReg);
1826   return NewReg;
1827 }
1828 
1829 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1830                                          const ArgDescriptor *Arg) const {
1831   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1832     return false; // TODO: Handle these
1833 
1834   assert(Arg->getRegister().isPhysical());
1835 
1836   MachineRegisterInfo &MRI = *B.getMRI();
1837 
1838   LLT Ty = MRI.getType(DstReg);
1839   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1840 
1841   if (Arg->isMasked()) {
1842     // TODO: Should we try to emit this once in the entry block?
1843     const LLT S32 = LLT::scalar(32);
1844     const unsigned Mask = Arg->getMask();
1845     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
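    // Illustrative example only: a mask of 0x3ff00000 gives Shift = 20, so the
    // field is recovered as (LiveIn >> 20) & 0x3ff.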
1846 
1847     Register AndMaskSrc = LiveIn;
1848 
1849     if (Shift != 0) {
1850       auto ShiftAmt = B.buildConstant(S32, Shift);
1851       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1852     }
1853 
1854     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1855   } else
1856     B.buildCopy(DstReg, LiveIn);
1857 
  // Insert the argument copy if it doesn't already exist.
1859   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1860   if (!MRI.getVRegDef(LiveIn)) {
1861     // FIXME: Should have scoped insert pt
1862     MachineBasicBlock &OrigInsBB = B.getMBB();
1863     auto OrigInsPt = B.getInsertPt();
1864 
1865     MachineBasicBlock &EntryMBB = B.getMF().front();
1866     EntryMBB.addLiveIn(Arg->getRegister());
1867     B.setInsertPt(EntryMBB, EntryMBB.begin());
1868     B.buildCopy(LiveIn, Arg->getRegister());
1869 
1870     B.setInsertPt(OrigInsBB, OrigInsPt);
1871   }
1872 
1873   return true;
1874 }
1875 
1876 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1877   MachineInstr &MI,
1878   MachineRegisterInfo &MRI,
1879   MachineIRBuilder &B,
1880   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1881   B.setInstr(MI);
1882 
1883   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1884 
1885   const ArgDescriptor *Arg;
1886   const TargetRegisterClass *RC;
1887   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1888   if (!Arg) {
1889     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1890     return false;
1891   }
1892 
1893   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1894     MI.eraseFromParent();
1895     return true;
1896   }
1897 
1898   return false;
1899 }
1900 
1901 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1902                                        MachineRegisterInfo &MRI,
1903                                        MachineIRBuilder &B) const {
1904   B.setInstr(MI);
1905   Register Dst = MI.getOperand(0).getReg();
1906   LLT DstTy = MRI.getType(Dst);
1907   LLT S16 = LLT::scalar(16);
1908   LLT S32 = LLT::scalar(32);
1909   LLT S64 = LLT::scalar(64);
1910 
1911   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1912     return true;
1913 
1914   if (DstTy == S16)
1915     return legalizeFDIV16(MI, MRI, B);
1916   if (DstTy == S32)
1917     return legalizeFDIV32(MI, MRI, B);
1918   if (DstTy == S64)
1919     return legalizeFDIV64(MI, MRI, B);
1920 
1921   return false;
1922 }
1923 
1924 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1925                                                  MachineRegisterInfo &MRI,
1926                                                  MachineIRBuilder &B) const {
1927   Register Res = MI.getOperand(0).getReg();
1928   Register LHS = MI.getOperand(1).getReg();
1929   Register RHS = MI.getOperand(2).getReg();
1930 
1931   uint16_t Flags = MI.getFlags();
1932 
1933   LLT ResTy = MRI.getType(Res);
1934   LLT S32 = LLT::scalar(32);
1935   LLT S64 = LLT::scalar(64);
1936 
1937   const MachineFunction &MF = B.getMF();
1938   bool Unsafe =
1939     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1940 
1941   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1942     return false;
1943 
1944   if (!Unsafe && ResTy == S32 &&
1945       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1946     return false;
1947 
1948   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1949     // 1 / x -> RCP(x)
1950     if (CLHS->isExactlyValue(1.0)) {
1951       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1952         .addUse(RHS)
1953         .setMIFlags(Flags);
1954 
1955       MI.eraseFromParent();
1956       return true;
1957     }
1958 
1959     // -1 / x -> RCP( FNEG(x) )
1960     if (CLHS->isExactlyValue(-1.0)) {
1961       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1962       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1963         .addUse(FNeg.getReg(0))
1964         .setMIFlags(Flags);
1965 
1966       MI.eraseFromParent();
1967       return true;
1968     }
1969   }
1970 
1971   // x / y -> x * (1.0 / y)
1972   if (Unsafe) {
1973     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1974       .addUse(RHS)
1975       .setMIFlags(Flags);
1976     B.buildFMul(Res, LHS, RCP, Flags);
1977 
1978     MI.eraseFromParent();
1979     return true;
1980   }
1981 
1982   return false;
1983 }
1984 
1985 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1986                                          MachineRegisterInfo &MRI,
1987                                          MachineIRBuilder &B) const {
1988   B.setInstr(MI);
1989   Register Res = MI.getOperand(0).getReg();
1990   Register LHS = MI.getOperand(1).getReg();
1991   Register RHS = MI.getOperand(2).getReg();
1992 
1993   uint16_t Flags = MI.getFlags();
1994 
1995   LLT S16 = LLT::scalar(16);
1996   LLT S32 = LLT::scalar(32);
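  // Roughly: the f16 division is performed in f32 and then fixed up, i.e.
  //   res ~= fptrunc(fpext(LHS) * rcp(fpext(RHS)))
  // with llvm.amdgcn.div.fixup patching up the special cases.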
1997 
1998   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1999   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2000 
2001   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2002     .addUse(RHSExt.getReg(0))
2003     .setMIFlags(Flags);
2004 
2005   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2006   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2007 
2008   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2009     .addUse(RDst.getReg(0))
2010     .addUse(RHS)
2011     .addUse(LHS)
2012     .setMIFlags(Flags);
2013 
2014   MI.eraseFromParent();
2015   return true;
2016 }
2017 
// Enable or disable FP32 denorm mode: when 'Enable' is true, emit instructions
// to enable denorm mode; otherwise, disable it.
2020 static void toggleSPDenormMode(bool Enable,
2021                                MachineIRBuilder &B,
2022                                const GCNSubtarget &ST,
2023                                AMDGPU::SIModeRegisterDefaults Mode) {
2024   // Set SP denorm mode to this value.
2025   unsigned SPDenormMode =
2026     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
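  // Encoding sketch (derived from the constants used below): the FP32 denorm
  // control is a 2-bit field at offset 4 of the MODE register. S_DENORM_MODE
  // takes a 4-bit immediate with FP32 in bits [1:0] and FP64/FP16 in bits
  // [3:2], while the S_SETREG form writes only the 2-bit FP32 field.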
2027 
2028   if (ST.hasDenormModeInst()) {
2029     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2030     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2031                                    ? FP_DENORM_FLUSH_NONE
2032                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2033 
2034     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2035     B.buildInstr(AMDGPU::S_DENORM_MODE)
2036       .addImm(NewDenormModeValue);
2037 
2038   } else {
2039     // Select FP32 bit field in mode register.
2040     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2041                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2042                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2043 
2044     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2045       .addImm(SPDenormMode)
2046       .addImm(SPDenormModeBitField);
2047   }
2048 }
2049 
2050 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2051                                          MachineRegisterInfo &MRI,
2052                                          MachineIRBuilder &B) const {
2053   B.setInstr(MI);
2054   Register Res = MI.getOperand(0).getReg();
2055   Register LHS = MI.getOperand(1).getReg();
2056   Register RHS = MI.getOperand(2).getReg();
2057   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2058   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2059 
2060   uint16_t Flags = MI.getFlags();
2061 
2062   LLT S32 = LLT::scalar(32);
2063   LLT S1 = LLT::scalar(1);
2064 
2065   auto One = B.buildFConstant(S32, 1.0f);
2066 
2067   auto DenominatorScaled =
2068     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2069       .addUse(RHS)
2070       .addUse(LHS)
2071       .addImm(1)
2072       .setMIFlags(Flags);
2073   auto NumeratorScaled =
2074     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2075       .addUse(LHS)
2076       .addUse(RHS)
2077       .addImm(0)
2078       .setMIFlags(Flags);
2079 
2080   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2081     .addUse(DenominatorScaled.getReg(0))
2082     .setMIFlags(Flags);
2083   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
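  // The sequence below follows the usual div_scale/div_fmas/div_fixup recipe.
  // In scalar terms (illustrative only; primes denote the scaled operands):
  //   r  = rcp(b')
  //   r1 = r + r * (1 - b' * r)      // one Newton-Raphson step on 1/b'
  //   q  = a' * r1
  //   q1 = q + r1 * (a' - b' * q)    // refine the quotient
  // div_fmas then applies the scale decision and div_fixup handles special
  // cases against the original operands.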
2084 
2085   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2086   // aren't modeled as reading it.
2087   if (!Mode.FP32Denormals)
2088     toggleSPDenormMode(true, B, ST, Mode);
2089 
2090   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2091   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2092   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2093   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2094   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2095   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2096 
2097   if (!Mode.FP32Denormals)
2098     toggleSPDenormMode(false, B, ST, Mode);
2099 
2100   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2101     .addUse(Fma4.getReg(0))
2102     .addUse(Fma1.getReg(0))
2103     .addUse(Fma3.getReg(0))
2104     .addUse(NumeratorScaled.getReg(1))
2105     .setMIFlags(Flags);
2106 
2107   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2108     .addUse(Fmas.getReg(0))
2109     .addUse(RHS)
2110     .addUse(LHS)
2111     .setMIFlags(Flags);
2112 
2113   MI.eraseFromParent();
2114   return true;
2115 }
2116 
2117 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2118                                          MachineRegisterInfo &MRI,
2119                                          MachineIRBuilder &B) const {
2120   B.setInstr(MI);
2121   Register Res = MI.getOperand(0).getReg();
2122   Register LHS = MI.getOperand(1).getReg();
2123   Register RHS = MI.getOperand(2).getReg();
2124 
2125   uint16_t Flags = MI.getFlags();
2126 
2127   LLT S64 = LLT::scalar(64);
2128   LLT S1 = LLT::scalar(1);
2129 
2130   auto One = B.buildFConstant(S64, 1.0);
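  // Same div_scale/div_fmas/div_fixup structure as the f32 path above, but the
  // reciprocal gets two refinement steps before the quotient is formed.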
2131 
2132   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2133     .addUse(LHS)
2134     .addUse(RHS)
2135     .addImm(1)
2136     .setMIFlags(Flags);
2137 
2138   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2139 
2140   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2141     .addUse(DivScale0.getReg(0))
2142     .setMIFlags(Flags);
2143 
2144   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2145   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2146   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2147 
2148   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2149     .addUse(LHS)
2150     .addUse(RHS)
2151     .addImm(0)
2152     .setMIFlags(Flags);
2153 
2154   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2156   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2157 
2158   Register Scale;
2159   if (!ST.hasUsableDivScaleConditionOutput()) {
2160     // Workaround a hardware bug on SI where the condition output from div_scale
2161     // is not usable.
2162 
2163     Scale = MRI.createGenericVirtualRegister(S1);
2164 
2165     LLT S32 = LLT::scalar(32);
2166 
2167     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2168     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2169     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2170     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2171 
2172     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2173                               Scale1Unmerge.getReg(1));
2174     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2175                               Scale0Unmerge.getReg(1));
2176     B.buildXor(Scale, CmpNum, CmpDen);
2177   } else {
2178     Scale = DivScale1.getReg(1);
2179   }
2180 
2181   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2182     .addUse(Fma4.getReg(0))
2183     .addUse(Fma3.getReg(0))
2184     .addUse(Mul.getReg(0))
2185     .addUse(Scale)
2186     .setMIFlags(Flags);
2187 
2188   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2189     .addUse(Fmas.getReg(0))
2190     .addUse(RHS)
2191     .addUse(LHS)
2192     .setMIFlags(Flags);
2193 
2194   MI.eraseFromParent();
2195   return true;
2196 }
2197 
2198 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2199                                                  MachineRegisterInfo &MRI,
2200                                                  MachineIRBuilder &B) const {
2201   B.setInstr(MI);
2202   Register Res = MI.getOperand(0).getReg();
2203   Register LHS = MI.getOperand(2).getReg();
2204   Register RHS = MI.getOperand(3).getReg();
2205   uint16_t Flags = MI.getFlags();
2206 
2207   LLT S32 = LLT::scalar(32);
2208   LLT S1 = LLT::scalar(1);
2209 
2210   auto Abs = B.buildFAbs(S32, RHS, Flags);
2211   const APFloat C0Val(1.0f);
2212 
2213   auto C0 = B.buildConstant(S32, 0x6f800000);
2214   auto C1 = B.buildConstant(S32, 0x2f800000);
2215   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
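  // The magic constants are float bit patterns (illustrative): 0x6f800000 is
  // 2^96 and 0x2f800000 is 2^-32. If |RHS| is very large, the denominator is
  // pre-scaled by 2^-32 so rcp stays in range, and the same scale factor is
  // reapplied to the final product.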
2216 
2217   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2218   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2219 
2220   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2221 
2222   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2223     .addUse(Mul0.getReg(0))
2224     .setMIFlags(Flags);
2225 
2226   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2227 
2228   B.buildFMul(Res, Sel, Mul1, Flags);
2229 
2230   MI.eraseFromParent();
2231   return true;
2232 }
2233 
2234 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2235                                                  MachineRegisterInfo &MRI,
2236                                                  MachineIRBuilder &B) const {
2237   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2238   if (!MFI->isEntryFunction()) {
2239     return legalizePreloadedArgIntrin(MI, MRI, B,
2240                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2241   }
2242 
2243   B.setInstr(MI);
2244 
2245   uint64_t Offset =
2246     ST.getTargetLowering()->getImplicitParameterOffset(
2247       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
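  // For kernels, the implicit arguments live directly after the explicit
  // kernel arguments, so the pointer is kernarg.segment.ptr plus the offset
  // computed above.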
2248   Register DstReg = MI.getOperand(0).getReg();
2249   LLT DstTy = MRI.getType(DstReg);
2250   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2251 
2252   const ArgDescriptor *Arg;
2253   const TargetRegisterClass *RC;
2254   std::tie(Arg, RC)
2255     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2256   if (!Arg)
2257     return false;
2258 
2259   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2260   if (!loadInputValue(KernargPtrReg, B, Arg))
2261     return false;
2262 
2263   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2264   MI.eraseFromParent();
2265   return true;
2266 }
2267 
2268 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2269                                               MachineRegisterInfo &MRI,
2270                                               MachineIRBuilder &B,
2271                                               unsigned AddrSpace) const {
2272   B.setInstr(MI);
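  // A flat pointer lies in the queried segment (LDS or private) iff the high
  // 32 bits of the pointer equal that segment's aperture base.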
2273   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2274   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2275   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2276   MI.eraseFromParent();
2277   return true;
2278 }
2279 
2280 /// Handle register layout difference for f16 images for some subtargets.
2281 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2282                                              MachineRegisterInfo &MRI,
2283                                              Register Reg) const {
2284   if (!ST.hasUnpackedD16VMem())
2285     return Reg;
2286 
2287   const LLT S16 = LLT::scalar(16);
2288   const LLT S32 = LLT::scalar(32);
2289   LLT StoreVT = MRI.getType(Reg);
2290   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
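  // On unpacked-d16 subtargets each 16-bit element occupies its own 32-bit
  // register, so e.g. a <4 x s16> value is rewritten as a <4 x s32> whose
  // elements are the anyext'd halves.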
2291 
2292   auto Unmerge = B.buildUnmerge(S16, Reg);
2293 
2294   SmallVector<Register, 4> WideRegs;
2295   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2296     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2297 
2298   int NumElts = StoreVT.getNumElements();
2299 
2300   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2301 }
2302 
2303 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2304                                                  MachineRegisterInfo &MRI,
2305                                                  MachineIRBuilder &B,
2306                                                  bool IsFormat) const {
2307   // TODO: Reject f16 format on targets where unsupported.
2308   Register VData = MI.getOperand(1).getReg();
2309   LLT Ty = MRI.getType(VData);
2310 
2311   B.setInstr(MI);
2312 
2313   const LLT S32 = LLT::scalar(32);
2314   const LLT S16 = LLT::scalar(16);
2315 
2316   // Fixup illegal register types for i8 stores.
2317   if (Ty == LLT::scalar(8) || Ty == S16) {
2318     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2319     MI.getOperand(1).setReg(AnyExt);
2320     return true;
2321   }
2322 
2323   if (Ty.isVector()) {
2324     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2325       if (IsFormat)
2326         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2327       return true;
2328     }
2329 
2330     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2331   }
2332 
2333   return Ty == S32;
2334 }
2335 
2336 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2337                                             MachineRegisterInfo &MRI,
2338                                             MachineIRBuilder &B) const {
2339   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2340   auto IntrID = MI.getIntrinsicID();
2341   switch (IntrID) {
2342   case Intrinsic::amdgcn_if:
2343   case Intrinsic::amdgcn_else: {
2344     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2345       const SIRegisterInfo *TRI
2346         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2347 
2348       B.setInstr(*BrCond);
2349       Register Def = MI.getOperand(1).getReg();
2350       Register Use = MI.getOperand(3).getReg();
2351 
2352       if (IntrID == Intrinsic::amdgcn_if) {
2353         B.buildInstr(AMDGPU::SI_IF)
2354           .addDef(Def)
2355           .addUse(Use)
2356           .addMBB(BrCond->getOperand(1).getMBB());
2357       } else {
2358         B.buildInstr(AMDGPU::SI_ELSE)
2359           .addDef(Def)
2360           .addUse(Use)
2361           .addMBB(BrCond->getOperand(1).getMBB())
2362           .addImm(0);
2363       }
2364 
2365       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2366       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2367       MI.eraseFromParent();
2368       BrCond->eraseFromParent();
2369       return true;
2370     }
2371 
2372     return false;
2373   }
2374   case Intrinsic::amdgcn_loop: {
2375     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2376       const SIRegisterInfo *TRI
2377         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2378 
2379       B.setInstr(*BrCond);
2380       Register Reg = MI.getOperand(2).getReg();
2381       B.buildInstr(AMDGPU::SI_LOOP)
2382         .addUse(Reg)
2383         .addMBB(BrCond->getOperand(1).getMBB());
2384       MI.eraseFromParent();
2385       BrCond->eraseFromParent();
2386       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2387       return true;
2388     }
2389 
2390     return false;
2391   }
2392   case Intrinsic::amdgcn_kernarg_segment_ptr:
2393     return legalizePreloadedArgIntrin(
2394       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2395   case Intrinsic::amdgcn_implicitarg_ptr:
2396     return legalizeImplicitArgPtr(MI, MRI, B);
2397   case Intrinsic::amdgcn_workitem_id_x:
2398     return legalizePreloadedArgIntrin(MI, MRI, B,
2399                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2400   case Intrinsic::amdgcn_workitem_id_y:
2401     return legalizePreloadedArgIntrin(MI, MRI, B,
2402                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2403   case Intrinsic::amdgcn_workitem_id_z:
2404     return legalizePreloadedArgIntrin(MI, MRI, B,
2405                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2406   case Intrinsic::amdgcn_workgroup_id_x:
2407     return legalizePreloadedArgIntrin(MI, MRI, B,
2408                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2409   case Intrinsic::amdgcn_workgroup_id_y:
2410     return legalizePreloadedArgIntrin(MI, MRI, B,
2411                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2412   case Intrinsic::amdgcn_workgroup_id_z:
2413     return legalizePreloadedArgIntrin(MI, MRI, B,
2414                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2415   case Intrinsic::amdgcn_dispatch_ptr:
2416     return legalizePreloadedArgIntrin(MI, MRI, B,
2417                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2418   case Intrinsic::amdgcn_queue_ptr:
2419     return legalizePreloadedArgIntrin(MI, MRI, B,
2420                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2421   case Intrinsic::amdgcn_implicit_buffer_ptr:
2422     return legalizePreloadedArgIntrin(
2423       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2424   case Intrinsic::amdgcn_dispatch_id:
2425     return legalizePreloadedArgIntrin(MI, MRI, B,
2426                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2427   case Intrinsic::amdgcn_fdiv_fast:
2428     return legalizeFDIVFastIntrin(MI, MRI, B);
2429   case Intrinsic::amdgcn_is_shared:
2430     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2431   case Intrinsic::amdgcn_is_private:
2432     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2433   case Intrinsic::amdgcn_wavefrontsize: {
2434     B.setInstr(MI);
2435     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2436     MI.eraseFromParent();
2437     return true;
2438   }
2439   case Intrinsic::amdgcn_raw_buffer_store:
2440     return legalizeRawBufferStore(MI, MRI, B, false);
2441   case Intrinsic::amdgcn_raw_buffer_store_format:
2442     return legalizeRawBufferStore(MI, MRI, B, true);
2443   default:
2444     return true;
2445   }
2446 
2447   return true;
2448 }
2449