1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
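// Any type whose total size is at most MaxSize bits and whose scalar element
// size is a multiple of 32 bits.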
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
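// Vectors with an odd number of sub-32-bit elements whose total size is not a
// multiple of 32 bits.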
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
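// Mutation that pads the vector type with one additional element.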
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
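// Mutation that reduces the element count so each resulting piece is roughly
// 64 bits wide (paired with vectorWiderThan below).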
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
// Any vector of 32, 64, 128, or 256-bit elements, any vector of an even number
// of 16-bit elements, and any non-vector type whose size is a multiple of 32
// bits, up to 1024 bits.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
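// Scalars wider than 32 bits that are stored with a narrower memory size,
// i.e. wide truncating stores.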
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal); // VCC branches
248   setAction({G_BRCOND, S32}, Legal); // SCC branches
249 
250   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
251   // elements for v3s16
252   getActionDefinitionsBuilder(G_PHI)
253     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
254     .legalFor(AllS32Vectors)
255     .legalFor(AllS64Vectors)
256     .legalFor(AddrSpaces64)
257     .legalFor(AddrSpaces32)
258     .clampScalar(0, S32, S256)
259     .widenScalarToNextPow2(0, 32)
260     .clampMaxNumElements(0, S32, 16)
261     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
262     .legalIf(isPointer(0));
263 
264   if (ST.has16BitInsts()) {
265     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
266       .legalFor({S32, S16})
267       .clampScalar(0, S16, S32)
268       .scalarize(0);
269   } else {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32})
272       .clampScalar(0, S32, S32)
273       .scalarize(0);
274   }
275 
276   // FIXME: Not really legal. Placeholder for custom lowering.
277   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278     .legalFor({S32, S64})
279     .clampScalar(0, S32, S64)
280     .widenScalarToNextPow2(0, 32)
281     .scalarize(0);
282 
283   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
284     .legalFor({S32})
285     .clampScalar(0, S32, S32)
286     .scalarize(0);
287 
288   // Report legal for any types we can handle anywhere. For the cases only legal
289   // on the SALU, RegBankSelect will be able to re-legalize.
290   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
291     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
292     .clampScalar(0, S32, S64)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
295     .widenScalarToNextPow2(0)
296     .scalarize(0);
297 
298   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
299                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300     .legalFor({{S32, S1}, {S32, S32}})
301     .clampScalar(0, S32, S32)
302     .scalarize(0); // TODO: Implement.
303 
304   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
305     .lower();
306 
307   getActionDefinitionsBuilder(G_BITCAST)
308     // Don't worry about the size constraint.
309     .legalIf(all(isRegisterType(0), isRegisterType(1)))
310     // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8)});
312 
313   getActionDefinitionsBuilder(G_FCONSTANT)
314     .legalFor({S32, S64, S16})
315     .clampScalar(0, S16, S64);
316 
317   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
318     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
319                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
320     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
321     .clampScalarOrElt(0, S32, S1024)
322     .legalIf(isMultiple32(0))
323     .widenScalarToNextPow2(0, 32)
324     .clampMaxNumElements(0, S32, 16);
325 
326 
327   // FIXME: i1 operands to intrinsics should always be legal, but other i1
328   // values may not be legal.  We need to figure out how to distinguish
329   // between these two scenarios.
330   getActionDefinitionsBuilder(G_CONSTANT)
331     .legalFor({S1, S32, S64, S16, GlobalPtr,
332                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333     .clampScalar(0, S32, S64)
334     .widenScalarToNextPow2(0)
335     .legalIf(isPointer(0));
336 
337   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340 
341 
342   auto &FPOpActions = getActionDefinitionsBuilder(
343     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344     .legalFor({S32, S64});
345   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346     .customFor({S32, S64});
347   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348     .customFor({S32, S64});
349 
350   if (ST.has16BitInsts()) {
351     if (ST.hasVOP3PInsts())
352       FPOpActions.legalFor({S16, V2S16});
353     else
354       FPOpActions.legalFor({S16});
355 
356     TrigActions.customFor({S16});
357     FDIVActions.customFor({S16});
358   }
359 
360   auto &MinNumMaxNum = getActionDefinitionsBuilder({
361       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362 
363   if (ST.hasVOP3PInsts()) {
364     MinNumMaxNum.customFor(FPTypesPK16)
365       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366       .clampMaxNumElements(0, S16, 2)
367       .clampScalar(0, S16, S64)
368       .scalarize(0);
369   } else if (ST.has16BitInsts()) {
370     MinNumMaxNum.customFor(FPTypes16)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else {
374     MinNumMaxNum.customFor(FPTypesBase)
375       .clampScalar(0, S32, S64)
376       .scalarize(0);
377   }
378 
379   if (ST.hasVOP3PInsts())
380     FPOpActions.clampMaxNumElements(0, S16, 2);
381 
382   FPOpActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   TrigActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   FDIVActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   getActionDefinitionsBuilder({G_FNEG, G_FABS})
395     .legalFor(FPTypesPK16)
396     .clampMaxNumElements(0, S16, 2)
397     .scalarize(0)
398     .clampScalar(0, S16, S64);
399 
400   // TODO: Implement
401   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
402 
403   if (ST.has16BitInsts()) {
404     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
405       .legalFor({S32, S64, S16})
406       .scalarize(0)
407       .clampScalar(0, S16, S64);
408   } else {
409     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
410       .legalFor({S32, S64})
411       .scalarize(0)
412       .clampScalar(0, S32, S64);
413   }
414 
415   getActionDefinitionsBuilder(G_FPTRUNC)
416     .legalFor({{S32, S64}, {S16, S32}})
417     .scalarize(0);
418 
419   getActionDefinitionsBuilder(G_FPEXT)
420     .legalFor({{S64, S32}, {S32, S16}})
421     .lowerFor({{S64, S16}}) // FIXME: Implement
422     .scalarize(0);
423 
424   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
425   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
426 
427   getActionDefinitionsBuilder(G_FSUB)
428       // Use actual fsub instruction
429       .legalFor({S32})
430       // Must use fadd + fneg
431       .lowerFor({S64, S16, V2S16})
432       .scalarize(0)
433       .clampScalar(0, S32, S64);
434 
435   // Whether this is legal depends on the floating point mode for the function.
436   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
437   if (ST.hasMadF16())
438     FMad.customFor({S32, S16});
439   else
440     FMad.customFor({S32});
441   FMad.scalarize(0)
442       .lower();
443 
444   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
445     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
446                {S32, S1}, {S64, S1}, {S16, S1},
447                {S96, S32},
448                // FIXME: Hack
449                {S64, LLT::scalar(33)},
450                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
451     .scalarize(0);
452 
453   // TODO: Split s1->s64 during regbankselect for VALU.
454   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
455     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
456     .lowerFor({{S32, S64}})
457     .lowerIf(typeIs(1, S1))
458     .customFor({{S64, S64}});
459   if (ST.has16BitInsts())
460     IToFP.legalFor({{S16, S16}});
461   IToFP.clampScalar(1, S32, S64)
462        .scalarize(0);
463 
464   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
465     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
466   if (ST.has16BitInsts())
467     FPToI.legalFor({{S16, S16}});
468   else
469     FPToI.minScalar(1, S32);
470 
471   FPToI.minScalar(0, S32)
472        .scalarize(0);
473 
474   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
475     .scalarize(0)
476     .lower();
477 
478   if (ST.has16BitInsts()) {
479     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
480       .legalFor({S16, S32, S64})
481       .clampScalar(0, S16, S64)
482       .scalarize(0);
483   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
484     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
485       .legalFor({S32, S64})
486       .clampScalar(0, S32, S64)
487       .scalarize(0);
488   } else {
489     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
490       .legalFor({S32})
491       .customFor({S64})
492       .clampScalar(0, S32, S64)
493       .scalarize(0);
494   }
495 
496   getActionDefinitionsBuilder(G_PTR_ADD)
497     .legalForCartesianProduct(AddrSpaces64, {S64})
498     .legalForCartesianProduct(AddrSpaces32, {S32})
499     .scalarize(0);
500 
501   getActionDefinitionsBuilder(G_PTR_MASK)
502     .scalarize(0)
503     .alwaysLegal();
504 
505   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
506 
507   auto &CmpBuilder =
508     getActionDefinitionsBuilder(G_ICMP)
509     // The compare output type differs based on the register bank of the output,
510     // so make both s1 and s32 legal.
511     //
512     // Scalar compares producing output in scc will be promoted to s32, as that
513     // is the allocatable register type that will be needed for the copy from
514     // scc. This will be promoted during RegBankSelect, and we assume something
515     // before that won't try to use s32 result types.
516     //
517     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
518     // bank.
519     .legalForCartesianProduct(
520       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
521     .legalForCartesianProduct(
522       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
523   if (ST.has16BitInsts()) {
524     CmpBuilder.legalFor({{S1, S16}});
525   }
526 
527   CmpBuilder
528     .widenScalarToNextPow2(1)
529     .clampScalar(1, S32, S64)
530     .scalarize(0)
531     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
532 
533   getActionDefinitionsBuilder(G_FCMP)
534     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
535     .widenScalarToNextPow2(1)
536     .clampScalar(1, S32, S64)
537     .scalarize(0);
538 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
540   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
541                                G_FLOG, G_FLOG2, G_FLOG10})
542     .legalFor({S32})
543     .scalarize(0);
544 
545   // The 64-bit versions produce 32-bit results, but only on the SALU.
546   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
547                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
548                                G_CTPOP})
549     .legalFor({{S32, S32}, {S32, S64}})
550     .clampScalar(0, S32, S32)
551     .clampScalar(1, S32, S64)
552     .scalarize(0)
553     .widenScalarToNextPow2(0, 32)
554     .widenScalarToNextPow2(1, 32);
555 
556   // TODO: Expand for > s32
557   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
558     .legalFor({S32})
559     .clampScalar(0, S32, S32)
560     .scalarize(0);
561 
562   if (ST.has16BitInsts()) {
563     if (ST.hasVOP3PInsts()) {
564       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
565         .legalFor({S32, S16, V2S16})
566         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
567         .clampMaxNumElements(0, S16, 2)
568         .clampScalar(0, S16, S32)
569         .widenScalarToNextPow2(0)
570         .scalarize(0);
571     } else {
572       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
573         .legalFor({S32, S16})
574         .widenScalarToNextPow2(0)
575         .clampScalar(0, S16, S32)
576         .scalarize(0);
577     }
578   } else {
579     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
580       .legalFor({S32})
581       .clampScalar(0, S32, S32)
582       .widenScalarToNextPow2(0)
583       .scalarize(0);
584   }
585 
586   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
587     return [=](const LegalityQuery &Query) {
588       return Query.Types[TypeIdx0].getSizeInBits() <
589              Query.Types[TypeIdx1].getSizeInBits();
590     };
591   };
592 
593   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
594     return [=](const LegalityQuery &Query) {
595       return Query.Types[TypeIdx0].getSizeInBits() >
596              Query.Types[TypeIdx1].getSizeInBits();
597     };
598   };
599 
600   getActionDefinitionsBuilder(G_INTTOPTR)
601     // List the common cases
602     .legalForCartesianProduct(AddrSpaces64, {S64})
603     .legalForCartesianProduct(AddrSpaces32, {S32})
604     .scalarize(0)
605     // Accept any address space as long as the size matches
606     .legalIf(sameSize(0, 1))
607     .widenScalarIf(smallerThan(1, 0),
608       [](const LegalityQuery &Query) {
609         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
610       })
611     .narrowScalarIf(greaterThan(1, 0),
612       [](const LegalityQuery &Query) {
613         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
614       });
615 
616   getActionDefinitionsBuilder(G_PTRTOINT)
617     // List the common cases
618     .legalForCartesianProduct(AddrSpaces64, {S64})
619     .legalForCartesianProduct(AddrSpaces32, {S32})
620     .scalarize(0)
621     // Accept any address space as long as the size matches
622     .legalIf(sameSize(0, 1))
623     .widenScalarIf(smallerThan(0, 1),
624       [](const LegalityQuery &Query) {
625         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
626       })
627     .narrowScalarIf(
628       greaterThan(0, 1),
629       [](const LegalityQuery &Query) {
630         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
631       });
632 
633   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
634     .scalarize(0)
635     .custom();
636 
637   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
638   // handle some operations by just promoting the register during
639   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
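  // Maximum memory access size, in bits, that a single load or store may use
  // for a given address space.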
640   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
641     switch (AS) {
642     // FIXME: Private element size.
643     case AMDGPUAS::PRIVATE_ADDRESS:
644       return 32;
645     // FIXME: Check subtarget
646     case AMDGPUAS::LOCAL_ADDRESS:
647       return ST.useDS128() ? 128 : 64;
648 
649     // Treat constant and global as identical. SMRD loads are sometimes usable
650     // for global loads (ideally constant address space should be eliminated)
651     // depending on the context. Legality cannot be context dependent, but
652     // RegBankSelect can split the load as necessary depending on the pointer
653     // register bank/uniformity and if the memory is invariant or not written in
654     // a kernel.
655     case AMDGPUAS::CONSTANT_ADDRESS:
656     case AMDGPUAS::GLOBAL_ADDRESS:
657       return 512;
658     default:
659       return 128;
660     }
661   };
662 
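  // Return true if this load or store must be split: vector extending loads,
  // accesses wider than the address space supports, 3-dword accesses on
  // subtargets without them, and misaligned accesses the hardware cannot do.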
663   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
664     const LLT DstTy = Query.Types[0];
665 
666     // Split vector extloads.
667     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
668     unsigned Align = Query.MMODescrs[0].AlignInBits;
669 
670     if (MemSize < DstTy.getSizeInBits())
671       MemSize = std::max(MemSize, Align);
672 
673     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
674       return true;
675 
676     const LLT PtrTy = Query.Types[1];
677     unsigned AS = PtrTy.getAddressSpace();
678     if (MemSize > maxSizeForAddrSpace(AS))
679       return true;
680 
681     // Catch weird sized loads that don't evenly divide into the access sizes
682     // TODO: May be able to widen depending on alignment etc.
683     unsigned NumRegs = MemSize / 32;
684     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
685       return true;
686 
687     if (Align < MemSize) {
688       const SITargetLowering *TLI = ST.getTargetLowering();
689       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
690     }
691 
692     return false;
693   };
694 
695   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
696   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
697   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
698 
699   // TODO: Refine based on subtargets which support unaligned access or 128-bit
700   // LDS
701   // TODO: Unsupported flat for SI.
702 
703   for (unsigned Op : {G_LOAD, G_STORE}) {
704     const bool IsStore = Op == G_STORE;
705 
706     auto &Actions = getActionDefinitionsBuilder(Op);
707     // Whitelist the common cases.
708     // TODO: Pointer loads
709     // TODO: Wide constant loads
710     // TODO: Only CI+ has 3x loads
711     // TODO: Loads to s16 on gfx9
712     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
713                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
714                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
715                                       {S96, GlobalPtr, 96, GlobalAlign32},
716                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
717                                       {S128, GlobalPtr, 128, GlobalAlign32},
718                                       {S64, GlobalPtr, 64, GlobalAlign32},
719                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
720                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
721                                       {S32, GlobalPtr, 8, GlobalAlign8},
722                                       {S32, GlobalPtr, 16, GlobalAlign16},
723 
724                                       {S32, LocalPtr, 32, 32},
725                                       {S64, LocalPtr, 64, 32},
726                                       {V2S32, LocalPtr, 64, 32},
727                                       {S32, LocalPtr, 8, 8},
728                                       {S32, LocalPtr, 16, 16},
729                                       {V2S16, LocalPtr, 32, 32},
730 
731                                       {S32, PrivatePtr, 32, 32},
732                                       {S32, PrivatePtr, 8, 8},
733                                       {S32, PrivatePtr, 16, 16},
734                                       {V2S16, PrivatePtr, 32, 32},
735 
736                                       {S32, FlatPtr, 32, GlobalAlign32},
737                                       {S32, FlatPtr, 16, GlobalAlign16},
738                                       {S32, FlatPtr, 8, GlobalAlign8},
739                                       {V2S16, FlatPtr, 32, GlobalAlign32},
740 
741                                       {S32, ConstantPtr, 32, GlobalAlign32},
742                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
743                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
744                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
745                                       {S64, ConstantPtr, 64, GlobalAlign32},
746                                       {S128, ConstantPtr, 128, GlobalAlign32},
747                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
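    // Accesses through the 32-bit constant address space are custom lowered.
    // Accesses the hardware cannot perform in one piece (see needToSplitLoad)
    // are narrowed or broken into smaller vectors.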
748     Actions
749         .customIf(typeIs(1, Constant32Ptr))
750         .narrowScalarIf(
751             [=](const LegalityQuery &Query) -> bool {
752               return !Query.Types[0].isVector() && needToSplitLoad(Query);
753             },
754             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
755               const LLT DstTy = Query.Types[0];
756               const LLT PtrTy = Query.Types[1];
757 
758               const unsigned DstSize = DstTy.getSizeInBits();
759               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
760 
761               // Split extloads.
762               if (DstSize > MemSize)
763                 return std::make_pair(0, LLT::scalar(MemSize));
764 
765               if (DstSize > 32 && (DstSize % 32 != 0)) {
766                 // FIXME: Need a way to specify non-extload of larger size if
767                 // suitably aligned.
768                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
769               }
770 
771               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
772               if (MemSize > MaxSize)
773                 return std::make_pair(0, LLT::scalar(MaxSize));
774 
775               unsigned Align = Query.MMODescrs[0].AlignInBits;
776               return std::make_pair(0, LLT::scalar(Align));
777             })
778         .fewerElementsIf(
779             [=](const LegalityQuery &Query) -> bool {
780               return Query.Types[0].isVector() && needToSplitLoad(Query);
781             },
782             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
783               const LLT DstTy = Query.Types[0];
784               const LLT PtrTy = Query.Types[1];
785 
786               LLT EltTy = DstTy.getElementType();
787               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
788 
789               // Split if it's too large for the address space.
790               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
791                 unsigned NumElts = DstTy.getNumElements();
792                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
793 
794                 // FIXME: Refine when odd breakdowns handled
795                 // The scalars will need to be re-legalized.
796                 if (NumPieces == 1 || NumPieces >= NumElts ||
797                     NumElts % NumPieces != 0)
798                   return std::make_pair(0, EltTy);
799 
800                 return std::make_pair(0,
801                                       LLT::vector(NumElts / NumPieces, EltTy));
802               }
803 
804               // Need to split because of alignment.
805               unsigned Align = Query.MMODescrs[0].AlignInBits;
806               unsigned EltSize = EltTy.getSizeInBits();
807               if (EltSize > Align &&
808                   (EltSize / Align < DstTy.getNumElements())) {
809                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
810               }
811 
812               // May need relegalization for the scalars.
813               return std::make_pair(0, EltTy);
814             })
815         .minScalar(0, S32);
816 
817     if (IsStore)
818       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
819 
820     // TODO: Need a bitcast lower option?
821     Actions
822         .legalIf([=](const LegalityQuery &Query) {
823           const LLT Ty0 = Query.Types[0];
824           unsigned Size = Ty0.getSizeInBits();
825           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
826           unsigned Align = Query.MMODescrs[0].AlignInBits;
827 
828           // FIXME: Widening store from alignment not valid.
829           if (MemSize < Size)
830             MemSize = std::max(MemSize, Align);
831 
832           // No extending vector loads.
833           if (Size > MemSize && Ty0.isVector())
834             return false;
835 
836           switch (MemSize) {
837           case 8:
838           case 16:
839             return Size == 32;
840           case 32:
841           case 64:
842           case 128:
843             return true;
844           case 96:
845             return ST.hasDwordx3LoadStores();
846           case 256:
847           case 512:
848             return true;
849           default:
850             return false;
851           }
852         })
853         .widenScalarToNextPow2(0)
854         // TODO: v3s32->v4s32 with alignment
855         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
856   }
857 
858   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
859                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
860                                                   {S32, GlobalPtr, 16, 2 * 8},
861                                                   {S32, LocalPtr, 8, 8},
862                                                   {S32, LocalPtr, 16, 16},
863                                                   {S32, PrivatePtr, 8, 8},
864                                                   {S32, PrivatePtr, 16, 16},
865                                                   {S32, ConstantPtr, 8, 8},
866                                                   {S32, ConstantPtr, 16, 2 * 8}});
867   if (ST.hasFlatAddressSpace()) {
868     ExtLoads.legalForTypesWithMemDesc(
869         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
870   }
871 
872   ExtLoads.clampScalar(0, S32, S32)
873           .widenScalarToNextPow2(0)
874           .unsupportedIfMemSizeNotPow2()
875           .lower();
876 
877   auto &Atomics = getActionDefinitionsBuilder(
878     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
879      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
880      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
881      G_ATOMICRMW_UMIN})
882     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
883                {S64, GlobalPtr}, {S64, LocalPtr}});
884   if (ST.hasFlatAddressSpace()) {
885     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
886   }
887 
888   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
889     .legalFor({{S32, LocalPtr}});
890 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
893   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
894     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
895                 {S32, FlatPtr}, {S64, FlatPtr}})
896     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
897                {S32, RegionPtr}, {S64, RegionPtr}});
898 
899   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
900     .lower();
901 
902   // TODO: Pointer types, any 32-bit or 64-bit vector
903 
904   // Condition should be s32 for scalar, s1 for vector.
905   getActionDefinitionsBuilder(G_SELECT)
906     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
907           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
908           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
909     .clampScalar(0, S16, S64)
910     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
911     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
912     .scalarize(1)
913     .clampMaxNumElements(0, S32, 2)
914     .clampMaxNumElements(0, LocalPtr, 2)
915     .clampMaxNumElements(0, PrivatePtr, 2)
916     .scalarize(0)
917     .widenScalarToNextPow2(0)
918     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
919 
920   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
921   // be more flexible with the shift amount type.
922   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
923     .legalFor({{S32, S32}, {S64, S32}});
924   if (ST.has16BitInsts()) {
925     if (ST.hasVOP3PInsts()) {
926       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
927             .clampMaxNumElements(0, S16, 2);
928     } else
929       Shifts.legalFor({{S16, S32}, {S16, S16}});
930 
931     // TODO: Support 16-bit shift amounts
932     Shifts.clampScalar(1, S32, S32);
933     Shifts.clampScalar(0, S16, S64);
934     Shifts.widenScalarToNextPow2(0, 16);
935   } else {
936     // Make sure we legalize the shift amount type first, as the general
937     // expansion for the shifted type will produce much worse code if it hasn't
938     // been truncated already.
939     Shifts.clampScalar(1, S32, S32);
940     Shifts.clampScalar(0, S32, S64);
941     Shifts.widenScalarToNextPow2(0, 32);
942   }
943   Shifts.scalarize(0);
944 
945   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
946     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
947     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
948     unsigned IdxTypeIdx = 2;
949 
950     getActionDefinitionsBuilder(Op)
951       .customIf([=](const LegalityQuery &Query) {
952           const LLT EltTy = Query.Types[EltTypeIdx];
953           const LLT VecTy = Query.Types[VecTypeIdx];
954           const LLT IdxTy = Query.Types[IdxTypeIdx];
955           return (EltTy.getSizeInBits() == 16 ||
956                   EltTy.getSizeInBits() % 32 == 0) &&
957                  VecTy.getSizeInBits() % 32 == 0 &&
958                  VecTy.getSizeInBits() <= 1024 &&
959                  IdxTy.getSizeInBits() == 32;
960         })
961       .clampScalar(EltTypeIdx, S32, S64)
962       .clampScalar(VecTypeIdx, S32, S64)
963       .clampScalar(IdxTypeIdx, S32, S32);
964   }
965 
966   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
967     .unsupportedIf([=](const LegalityQuery &Query) {
968         const LLT &EltTy = Query.Types[1].getElementType();
969         return Query.Types[0] != EltTy;
970       });
971 
972   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
973     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
974     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
975 
976     // FIXME: Doesn't handle extract of illegal sizes.
977     getActionDefinitionsBuilder(Op)
978       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
979       // FIXME: Multiples of 16 should not be legal.
980       .legalIf([=](const LegalityQuery &Query) {
981           const LLT BigTy = Query.Types[BigTyIdx];
982           const LLT LitTy = Query.Types[LitTyIdx];
983           return (BigTy.getSizeInBits() % 32 == 0) &&
984                  (LitTy.getSizeInBits() % 16 == 0);
985         })
986       .widenScalarIf(
987         [=](const LegalityQuery &Query) {
988           const LLT BigTy = Query.Types[BigTyIdx];
989           return (BigTy.getScalarSizeInBits() < 16);
990         },
991         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
992       .widenScalarIf(
993         [=](const LegalityQuery &Query) {
994           const LLT LitTy = Query.Types[LitTyIdx];
995           return (LitTy.getScalarSizeInBits() < 16);
996         },
997         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
998       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
999       .widenScalarToNextPow2(BigTyIdx, 32);
1000 
1001   }
1002 
1003   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1004     .legalForCartesianProduct(AllS32Vectors, {S32})
1005     .legalForCartesianProduct(AllS64Vectors, {S64})
1006     .clampNumElements(0, V16S32, V32S32)
1007     .clampNumElements(0, V2S64, V16S64)
1008     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1009 
1010   if (ST.hasScalarPackInsts())
1011     BuildVector.legalFor({V2S16, S32});
1012 
1013   BuildVector
1014     .minScalarSameAs(1, 0)
1015     .legalIf(isRegisterType(0))
1016     .minScalarOrElt(0, S32);
1017 
1018   if (ST.hasScalarPackInsts()) {
1019     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1020       .legalFor({V2S16, S32})
1021       .lower();
1022   } else {
1023     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1024       .lower();
1025   }
1026 
1027   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1028     .legalIf(isRegisterType(0));
1029 
1030   // TODO: Don't fully scalarize v2s16 pieces
1031   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1032 
1033   // Merge/Unmerge
1034   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1035     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1036     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1037 
1038     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1039       const LLT &Ty = Query.Types[TypeIdx];
1040       if (Ty.isVector()) {
1041         const LLT &EltTy = Ty.getElementType();
1042         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1043           return true;
1044         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1045           return true;
1046       }
1047       return false;
1048     };
1049 
1050     auto &Builder = getActionDefinitionsBuilder(Op)
1051       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1055       .clampScalar(LitTyIdx, S16, S256)
1056       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1057       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1058       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1059                            elementTypeIs(1, S16)),
1060                        changeTo(1, V2S16))
1061       // Break up vectors with weird elements into scalars
1062       .fewerElementsIf(
1063         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1064         scalarize(0))
1065       .fewerElementsIf(
1066         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1067         scalarize(1))
1068       .clampScalar(BigTyIdx, S32, S1024)
1069       .lowerFor({{S16, V2S16}});
1070 
1071     if (Op == G_MERGE_VALUES) {
1072       Builder.widenScalarIf(
1073         // TODO: Use 16-bit shifts if legal for 8-bit values?
1074         [=](const LegalityQuery &Query) {
1075           const LLT Ty = Query.Types[LitTyIdx];
1076           return Ty.getSizeInBits() < 32;
1077         },
1078         changeTo(LitTyIdx, S32));
1079     }
1080 
1081     Builder.widenScalarIf(
1082       [=](const LegalityQuery &Query) {
1083         const LLT Ty = Query.Types[BigTyIdx];
1084         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1085           Ty.getSizeInBits() % 16 != 0;
1086       },
1087       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1090         const LLT &Ty = Query.Types[BigTyIdx];
1091         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1092         if (NewSizeInBits >= 256) {
1093           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1094           if (RoundedTo < NewSizeInBits)
1095             NewSizeInBits = RoundedTo;
1096         }
1097         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1098       })
1099       .legalIf([=](const LegalityQuery &Query) {
1100           const LLT &BigTy = Query.Types[BigTyIdx];
1101           const LLT &LitTy = Query.Types[LitTyIdx];
1102 
1103           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1104             return false;
1105           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1106             return false;
1107 
1108           return BigTy.getSizeInBits() % 16 == 0 &&
1109                  LitTy.getSizeInBits() % 16 == 0 &&
1110                  BigTy.getSizeInBits() <= 1024;
1111         })
1112       // Any vectors left are the wrong size. Scalarize them.
1113       .scalarize(0)
1114       .scalarize(1);
1115   }
1116 
1117   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1118 
1119   getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1120 
1121   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1122     .legalFor({S64});
1123 
1124   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1125         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1126         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1127     .unsupported();
1128 
1129   computeTables();
1130   verify(*ST.getInstrInfo());
1131 }
1132 
1133 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1134                                          MachineRegisterInfo &MRI,
1135                                          MachineIRBuilder &B,
1136                                          GISelChangeObserver &Observer) const {
1137   switch (MI.getOpcode()) {
1138   case TargetOpcode::G_ADDRSPACE_CAST:
1139     return legalizeAddrSpaceCast(MI, MRI, B);
1140   case TargetOpcode::G_FRINT:
1141     return legalizeFrint(MI, MRI, B);
1142   case TargetOpcode::G_FCEIL:
1143     return legalizeFceil(MI, MRI, B);
1144   case TargetOpcode::G_INTRINSIC_TRUNC:
1145     return legalizeIntrinsicTrunc(MI, MRI, B);
1146   case TargetOpcode::G_SITOFP:
1147     return legalizeITOFP(MI, MRI, B, true);
1148   case TargetOpcode::G_UITOFP:
1149     return legalizeITOFP(MI, MRI, B, false);
1150   case TargetOpcode::G_FMINNUM:
1151   case TargetOpcode::G_FMAXNUM:
1152   case TargetOpcode::G_FMINNUM_IEEE:
1153   case TargetOpcode::G_FMAXNUM_IEEE:
1154     return legalizeMinNumMaxNum(MI, MRI, B);
1155   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1156     return legalizeExtractVectorElt(MI, MRI, B);
1157   case TargetOpcode::G_INSERT_VECTOR_ELT:
1158     return legalizeInsertVectorElt(MI, MRI, B);
1159   case TargetOpcode::G_FSIN:
1160   case TargetOpcode::G_FCOS:
1161     return legalizeSinCos(MI, MRI, B);
1162   case TargetOpcode::G_GLOBAL_VALUE:
1163     return legalizeGlobalValue(MI, MRI, B);
1164   case TargetOpcode::G_LOAD:
1165     return legalizeLoad(MI, MRI, B, Observer);
1166   case TargetOpcode::G_FMAD:
1167     return legalizeFMad(MI, MRI, B);
1168   case TargetOpcode::G_FDIV:
1169     return legalizeFDIV(MI, MRI, B);
1170   case TargetOpcode::G_ATOMIC_CMPXCHG:
1171     return legalizeAtomicCmpXChg(MI, MRI, B);
1172   default:
1173     return false;
1174   }
1175 
1176   llvm_unreachable("expected switch to return");
1177 }
1178 
1179 Register AMDGPULegalizerInfo::getSegmentAperture(
1180   unsigned AS,
1181   MachineRegisterInfo &MRI,
1182   MachineIRBuilder &B) const {
1183   MachineFunction &MF = B.getMF();
1184   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1185   const LLT S32 = LLT::scalar(32);
1186 
1187   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1188 
1189   if (ST.hasApertureRegs()) {
1190     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1191     // getreg.
1192     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1193         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1194         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1195     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1196         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1197         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1198     unsigned Encoding =
1199         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1200         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1201         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1202 
1203     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1204     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1205 
1206     B.buildInstr(AMDGPU::S_GETREG_B32)
1207       .addDef(GetReg)
1208       .addImm(Encoding);
1209     MRI.setType(GetReg, S32);
1210 
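    // The hardware register field holds the aperture base shifted down; shift
    // the getreg result back up by the field width to form the high 32 bits of
    // the aperture address.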
1211     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1212     B.buildInstr(TargetOpcode::G_SHL)
1213       .addDef(ApertureReg)
1214       .addUse(GetReg)
1215       .addUse(ShiftAmt.getReg(0));
1216 
1217     return ApertureReg;
1218   }
1219 
1220   Register QueuePtr = MRI.createGenericVirtualRegister(
1221     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1222 
1223   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1224   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1225     return Register();
1226 
1227   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1228   // private_segment_aperture_base_hi.
1229   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1230 
1231   // TODO: can we be smarter about machine pointer info?
1232   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1233   MachineMemOperand *MMO = MF.getMachineMemOperand(
1234     PtrInfo,
1235     MachineMemOperand::MOLoad |
1236     MachineMemOperand::MODereferenceable |
1237     MachineMemOperand::MOInvariant,
1238     4,
1239     MinAlign(64, StructOffset));
1240 
1241   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1242   Register LoadAddr;
1243 
1244   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1245   B.buildLoad(LoadResult, LoadAddr, *MMO);
1246   return LoadResult;
1247 }
1248 
1249 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1250   MachineInstr &MI, MachineRegisterInfo &MRI,
1251   MachineIRBuilder &B) const {
1252   MachineFunction &MF = B.getMF();
1253 
1254   B.setInstr(MI);
1255 
1256   const LLT S32 = LLT::scalar(32);
1257   Register Dst = MI.getOperand(0).getReg();
1258   Register Src = MI.getOperand(1).getReg();
1259 
1260   LLT DstTy = MRI.getType(Dst);
1261   LLT SrcTy = MRI.getType(Src);
1262   unsigned DestAS = DstTy.getAddressSpace();
1263   unsigned SrcAS = SrcTy.getAddressSpace();
1264 
1265   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1266   // vector element.
1267   assert(!DstTy.isVector());
1268 
1269   const AMDGPUTargetMachine &TM
1270     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1271 
1272   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1273   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1274     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1275     return true;
1276   }
1277 
1278   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1279     // Truncate.
1280     B.buildExtract(Dst, Src, 0);
1281     MI.eraseFromParent();
1282     return true;
1283   }
1284 
1285   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1286     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1287     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1288 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1292     auto HighAddr = B.buildConstant(
1293       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1294     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1295     MI.eraseFromParent();
1296     return true;
1297   }
1298 
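  // Casting a flat pointer to a 32-bit segment (local/private) pointer: keep
  // the low 32 bits, but map the flat null pointer to the segment null value.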
1299   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1300     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1301            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1302     unsigned NullVal = TM.getNullPointerValue(DestAS);
1303 
1304     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1305     auto FlatNull = B.buildConstant(SrcTy, 0);
1306 
1307     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1308 
1309     // Extract low 32-bits of the pointer.
1310     B.buildExtract(PtrLo32, Src, 0);
1311 
1312     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1313     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1314     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1315 
1316     MI.eraseFromParent();
1317     return true;
1318   }
1319 
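  // The remaining case is a cast from a 32-bit segment (local/private) pointer
  // to a flat pointer: build the 64-bit address from the 32-bit offset and the
  // segment aperture, mapping the segment null value to the flat null pointer.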
1320   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1321     return false;
1322 
1323   if (!ST.hasFlatAddressSpace())
1324     return false;
1325 
1326   auto SegmentNull =
1327       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1328   auto FlatNull =
1329       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1330 
1331   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1332   if (!ApertureReg.isValid())
1333     return false;
1334 
1335   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1336   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1337 
1338   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1339 
1340   // Coerce the type of the low half of the result so we can use merge_values.
1341   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1342   B.buildInstr(TargetOpcode::G_PTRTOINT)
1343     .addDef(SrcAsInt)
1344     .addUse(Src);
1345 
1346   // TODO: Should we allow mismatched types but matching sizes in merges to
1347   // avoid the ptrtoint?
1348   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1349   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1350 
1351   MI.eraseFromParent();
1352   return true;
1353 }
1354 
1355 bool AMDGPULegalizerInfo::legalizeFrint(
1356   MachineInstr &MI, MachineRegisterInfo &MRI,
1357   MachineIRBuilder &B) const {
1358   B.setInstr(MI);
1359 
1360   Register Src = MI.getOperand(1).getReg();
1361   LLT Ty = MRI.getType(Src);
1362   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1363 
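  // 2^52 trick: adding and then subtracting a constant of magnitude 2^52
  // (carrying the sign of the source) rounds the value to an integer in the
  // current rounding mode. Anything with a magnitude above C2 (just below
  // 2^52) is already an integer, so the original source is selected for it.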
1364   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1365   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1366 
1367   auto C1 = B.buildFConstant(Ty, C1Val);
1368   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1369 
1370   // TODO: Should this propagate fast-math-flags?
1371   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1372   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1373 
1374   auto C2 = B.buildFConstant(Ty, C2Val);
1375   auto Fabs = B.buildFAbs(Ty, Src);
1376 
1377   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1380 }
1381 
1382 bool AMDGPULegalizerInfo::legalizeFceil(
1383   MachineInstr &MI, MachineRegisterInfo &MRI,
1384   MachineIRBuilder &B) const {
1385   B.setInstr(MI);
1386 
1387   const LLT S1 = LLT::scalar(1);
1388   const LLT S64 = LLT::scalar(64);
1389 
1390   Register Src = MI.getOperand(1).getReg();
1391   assert(MRI.getType(Src) == S64);
1392 
1393   // result = trunc(src)
1394   // if (src > 0.0 && src != result)
1395   //   result += 1.0
1396 
1397   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1398 
1399   const auto Zero = B.buildFConstant(S64, 0.0);
1400   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1404   auto Add = B.buildSelect(S64, And, One, Zero);
1405 
1406   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1409 }
1410 
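// Return the unbiased exponent of an f64 value given the high 32 bits of that
// value: extract the 11-bit exponent field with the ubfe intrinsic and
// subtract the exponent bias of 1023.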
1411 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1412                                               MachineIRBuilder &B) {
1413   const unsigned FractBits = 52;
1414   const unsigned ExpBits = 11;
1415   LLT S32 = LLT::scalar(32);
1416 
1417   auto Const0 = B.buildConstant(S32, FractBits - 32);
1418   auto Const1 = B.buildConstant(S32, ExpBits);
1419 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1423 
1424   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1425 }
1426 
1427 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1428   MachineInstr &MI, MachineRegisterInfo &MRI,
1429   MachineIRBuilder &B) const {
1430   B.setInstr(MI);
1431 
1432   const LLT S1 = LLT::scalar(1);
1433   const LLT S32 = LLT::scalar(32);
1434   const LLT S64 = LLT::scalar(64);
1435 
1436   Register Src = MI.getOperand(1).getReg();
1437   assert(MRI.getType(Src) == S64);
1438 
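  // trunc(x) for f64 is implemented by clearing the fraction bits below the
  // exponent: if the unbiased exponent is negative, |x| < 1 and the result is
  // just the sign bit (+/-0); if it is greater than 51 there are no
  // fractional bits and x is already integral; otherwise AND away the low
  // (52 - exponent) mantissa bits.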
1439   // TODO: Should this use extract since the low half is unused?
1440   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1441   Register Hi = Unmerge.getReg(1);
1442 
1443   // Extract the upper half, since this is where we will find the sign and
1444   // exponent.
1445   auto Exp = extractF64Exponent(Hi, B);
1446 
1447   const unsigned FractBits = 52;
1448 
1449   // Extract the sign bit.
1450   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1451   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1452 
1453   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1454 
1455   const auto Zero32 = B.buildConstant(S32, 0);
1456 
1457   // Extend back to 64-bits.
1458   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1459 
1460   auto Shr = B.buildAShr(S64, FractMask, Exp);
1461   auto Not = B.buildNot(S64, Shr);
1462   auto Tmp0 = B.buildAnd(S64, Src, Not);
1463   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1464 
1465   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1466   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1467 
1468   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1469   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
1470   return true;
1471 }
1472 
1473 bool AMDGPULegalizerInfo::legalizeITOFP(
1474   MachineInstr &MI, MachineRegisterInfo &MRI,
1475   MachineIRBuilder &B, bool Signed) const {
1476   B.setInstr(MI);
1477 
1478   Register Dst = MI.getOperand(0).getReg();
1479   Register Src = MI.getOperand(1).getReg();
1480 
1481   const LLT S64 = LLT::scalar(64);
1482   const LLT S32 = LLT::scalar(32);
1483 
1484   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1485 
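  // Convert the 64-bit integer as hi * 2^32 + lo: convert the high half
  // (signed or unsigned), scale it by 2^32 with ldexp, then add the unsigned
  // conversion of the low half. Each 32-bit half converts to f64 exactly, so
  // the only rounding happens in the final add.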
1486   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1487 
1488   auto CvtHi = Signed ?
1489     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1490     B.buildUITOFP(S64, Unmerge.getReg(1));
1491 
1492   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1493 
1494   auto ThirtyTwo = B.buildConstant(S32, 32);
1495   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1496     .addUse(CvtHi.getReg(0))
1497     .addUse(ThirtyTwo.getReg(0));
1498 
1499   // TODO: Should this propagate fast-math-flags?
1500   B.buildFAdd(Dst, LdExp, CvtLo);
1501   MI.eraseFromParent();
1502   return true;
1503 }
1504 
1505 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1506   MachineInstr &MI, MachineRegisterInfo &MRI,
1507   MachineIRBuilder &B) const {
1508   MachineFunction &MF = B.getMF();
1509   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1510 
1511   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1512                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1513 
1514   // With ieee_mode disabled, the instructions already have the correct
1515   // behavior for G_FMINNUM/G_FMAXNUM.
1516   if (!MFI->getMode().IEEE)
1517     return !IsIEEEOp;
1518 
1519   if (IsIEEEOp)
1520     return true;
1521 
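  // In IEEE mode the hardware min/max match the IEEE opcodes, so the relaxed
  // G_FMINNUM/G_FMAXNUM forms are expanded by the generic LegalizerHelper,
  // which inserts the quieting needed for signaling NaN inputs.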
1522   MachineIRBuilder HelperBuilder(MI);
1523   GISelObserverWrapper DummyObserver;
1524   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1525   HelperBuilder.setInstr(MI);
1526   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1527 }
1528 
1529 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1530   MachineInstr &MI, MachineRegisterInfo &MRI,
1531   MachineIRBuilder &B) const {
1532   // TODO: Should move some of this into LegalizerHelper.
1533 
1534   // TODO: Promote dynamic indexing of s16 to s32
1535   // TODO: Dynamic s64 indexing is only legal for SGPR.
1536   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1537   if (!IdxVal) // Dynamic case will be selected to register indexing.
1538     return true;
1539 
1540   Register Dst = MI.getOperand(0).getReg();
1541   Register Vec = MI.getOperand(1).getReg();
1542 
1543   LLT VecTy = MRI.getType(Vec);
1544   LLT EltTy = VecTy.getElementType();
1545   assert(EltTy == MRI.getType(Dst));
1546 
1547   B.setInstr(MI);
1548 
1549   if (IdxVal.getValue() < VecTy.getNumElements())
1550     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1551   else
1552     B.buildUndef(Dst);
1553 
1554   MI.eraseFromParent();
1555   return true;
1556 }
1557 
1558 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1559   MachineInstr &MI, MachineRegisterInfo &MRI,
1560   MachineIRBuilder &B) const {
1561   // TODO: Should move some of this into LegalizerHelper.
1562 
1563   // TODO: Promote dynamic indexing of s16 to s32
1564   // TODO: Dynamic s64 indexing is only legal for SGPR.
1565   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1566   if (!IdxVal) // Dynamic case will be selected to register indexing.
1567     return true;
1568 
1569   Register Dst = MI.getOperand(0).getReg();
1570   Register Vec = MI.getOperand(1).getReg();
1571   Register Ins = MI.getOperand(2).getReg();
1572 
1573   LLT VecTy = MRI.getType(Vec);
1574   LLT EltTy = VecTy.getElementType();
1575   assert(EltTy == MRI.getType(Ins));
1576 
1577   B.setInstr(MI);
1578 
1579   if (IdxVal.getValue() < VecTy.getNumElements())
1580     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1581   else
1582     B.buildUndef(Dst);
1583 
1584   MI.eraseFromParent();
1585   return true;
1586 }
1587 
1588 bool AMDGPULegalizerInfo::legalizeSinCos(
1589   MachineInstr &MI, MachineRegisterInfo &MRI,
1590   MachineIRBuilder &B) const {
1591   B.setInstr(MI);
1592 
1593   Register DstReg = MI.getOperand(0).getReg();
1594   Register SrcReg = MI.getOperand(1).getReg();
1595   LLT Ty = MRI.getType(DstReg);
1596   unsigned Flags = MI.getFlags();
1597 
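  // The hardware sin/cos take their input in units of full rotations rather
  // than radians, so pre-multiply by 1/(2*pi). Subtargets with a reduced
  // valid input range additionally need the scaled argument wrapped into
  // [0, 1) with amdgcn.fract before the trig instruction.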
1598   Register TrigVal;
1599   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1600   if (ST.hasTrigReducedRange()) {
1601     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1602     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1603       .addUse(MulVal.getReg(0))
1604       .setMIFlags(Flags).getReg(0);
1605   } else
1606     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1607 
1608   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1609     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1610   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1611     .addUse(TrigVal)
1612     .setMIFlags(Flags);
1613   MI.eraseFromParent();
1614   return true;
1615 }
1616 
1617 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1618   Register DstReg, LLT PtrTy,
1619   MachineIRBuilder &B, const GlobalValue *GV,
1620   unsigned Offset, unsigned GAFlags) const {
1621   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1622   // to the following code sequence:
1623   //
1624   // For constant address space:
1625   //   s_getpc_b64 s[0:1]
1626   //   s_add_u32 s0, s0, $symbol
1627   //   s_addc_u32 s1, s1, 0
1628   //
1629   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1630   //   a fixup or relocation is emitted to replace $symbol with a literal
1631   //   constant, which is a pc-relative offset from the encoding of the $symbol
1632   //   operand to the global variable.
1633   //
1634   // For global address space:
1635   //   s_getpc_b64 s[0:1]
1636   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1637   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1638   //
1639   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1640   //   fixups or relocations are emitted to replace $symbol@*@lo and
1641   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1642   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1643   //   operand to the global variable.
1644   //
1645   // What we want here is an offset from the value returned by s_getpc
1646   // (which is the address of the s_add_u32 instruction) to the global
1647   // variable, but since the encoding of $symbol starts 4 bytes after the start
1648   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1649   // small. This requires us to add 4 to the global variable offset in order to
1650   // compute the correct address.
1651 
1652   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1653 
1654   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1655     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1656 
1657   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1658     .addDef(PCReg);
1659 
1660   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1661   if (GAFlags == SIInstrInfo::MO_NONE)
1662     MIB.addImm(0);
1663   else
1664     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1665 
1666   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1667 
1668   if (PtrTy.getSizeInBits() == 32)
1669     B.buildExtract(DstReg, PCReg, 0);
1670   return true;
1671 }
1672 
1673 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1674   MachineInstr &MI, MachineRegisterInfo &MRI,
1675   MachineIRBuilder &B) const {
1676   Register DstReg = MI.getOperand(0).getReg();
1677   LLT Ty = MRI.getType(DstReg);
1678   unsigned AS = Ty.getAddressSpace();
1679 
1680   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1681   MachineFunction &MF = B.getMF();
1682   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1683   B.setInstr(MI);
1684 
1685   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1686     if (!MFI->isEntryFunction()) {
1687       const Function &Fn = MF.getFunction();
1688       DiagnosticInfoUnsupported BadLDSDecl(
1689         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1690       Fn.getContext().diagnose(BadLDSDecl);
1691     }
1692 
1693     // TODO: We could emit code to handle the initialization somewhere.
1694     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1695       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1696       MI.eraseFromParent();
1697       return true;
1698     }
1699 
1700     const Function &Fn = MF.getFunction();
1701     DiagnosticInfoUnsupported BadInit(
1702       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1703     Fn.getContext().diagnose(BadInit);
1704     return true;
1705   }
1706 
1707   const SITargetLowering *TLI = ST.getTargetLowering();
1708 
1709   if (TLI->shouldEmitFixup(GV)) {
1710     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1711     MI.eraseFromParent();
1712     return true;
1713   }
1714 
1715   if (TLI->shouldEmitPCReloc(GV)) {
1716     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1717     MI.eraseFromParent();
1718     return true;
1719   }
1720 
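  // Otherwise go through the GOT: materialize a pc-relative address of the
  // GOT entry and load the 64-bit absolute address of the global from it.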
1721   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1722   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1723 
1724   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1725     MachinePointerInfo::getGOT(MF),
1726     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1727     MachineMemOperand::MOInvariant,
1728     8 /*Size*/, 8 /*Align*/);
1729 
1730   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1731 
1732   if (Ty.getSizeInBits() == 32) {
1733     // Truncate if this is a 32-bit constant address.
1734     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1735     B.buildExtract(DstReg, Load, 0);
1736   } else
1737     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1738 
1739   MI.eraseFromParent();
1740   return true;
1741 }
1742 
1743 bool AMDGPULegalizerInfo::legalizeLoad(
1744   MachineInstr &MI, MachineRegisterInfo &MRI,
1745   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1746   B.setInstr(MI);
1747   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1748   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1749   Observer.changingInstr(MI);
1750   MI.getOperand(1).setReg(Cast.getReg(0));
1751   Observer.changedInstr(MI);
1752   return true;
1753 }
1754 
1755 bool AMDGPULegalizerInfo::legalizeFMad(
1756   MachineInstr &MI, MachineRegisterInfo &MRI,
1757   MachineIRBuilder &B) const {
1758   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1759   assert(Ty.isScalar());
1760 
1761   MachineFunction &MF = B.getMF();
1762   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1763 
1764   // TODO: Always legal with future ftz flag.
1765   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1766     return true;
1767   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1768     return true;
1769
1771   MachineIRBuilder HelperBuilder(MI);
1772   GISelObserverWrapper DummyObserver;
1773   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1774   HelperBuilder.setMBB(*MI.getParent());
1775   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1776 }
1777 
1778 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1779   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1780   Register DstReg = MI.getOperand(0).getReg();
1781   Register PtrReg = MI.getOperand(1).getReg();
1782   Register CmpVal = MI.getOperand(2).getReg();
1783   Register NewVal = MI.getOperand(3).getReg();
1784 
1785   assert(SITargetLowering::isFlatGlobalAddrSpace(
1786            MRI.getType(PtrReg).getAddressSpace()) &&
1787          "this should not have been custom lowered");
1788 
1789   LLT ValTy = MRI.getType(CmpVal);
1790   LLT VecTy = LLT::vector(2, ValTy);
1791 
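  // G_AMDGPU_ATOMIC_CMPXCHG takes the new value and the compare value packed
  // into a single two-element vector data operand (new value first), matching
  // the data layout expected by the hardware cmpswap instructions.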
1792   B.setInstr(MI);
1793   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1794 
1795   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1796     .addDef(DstReg)
1797     .addUse(PtrReg)
1798     .addUse(PackedVal)
1799     .setMemRefs(MI.memoperands());
1800 
1801   MI.eraseFromParent();
1802   return true;
1803 }
1804 
1805 // Return the use branch instruction, otherwise null if the usage is invalid.
1806 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1807                                        MachineRegisterInfo &MRI) {
1808   Register CondDef = MI.getOperand(0).getReg();
1809   if (!MRI.hasOneNonDBGUse(CondDef))
1810     return nullptr;
1811 
1812   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1813   return UseMI.getParent() == MI.getParent() &&
1814     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1815 }
1816 
1817 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1818                                                 Register Reg, LLT Ty) const {
1819   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1820   if (LiveIn)
1821     return LiveIn;
1822 
1823   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1824   MRI.addLiveIn(Reg, NewReg);
1825   return NewReg;
1826 }
1827 
1828 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1829                                          const ArgDescriptor *Arg) const {
1830   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1831     return false; // TODO: Handle these
1832 
1833   assert(Arg->getRegister().isPhysical());
1834 
1835   MachineRegisterInfo &MRI = *B.getMRI();
1836 
1837   LLT Ty = MRI.getType(DstReg);
1838   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1839 
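  // A masked argument occupies only a bit field of its incoming register
  // (several values packed into one register), so shift the live-in down to
  // the field's start bit and mask it to the field's width.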
1840   if (Arg->isMasked()) {
1841     // TODO: Should we try to emit this once in the entry block?
1842     const LLT S32 = LLT::scalar(32);
1843     const unsigned Mask = Arg->getMask();
1844     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1845 
1846     Register AndMaskSrc = LiveIn;
1847 
1848     if (Shift != 0) {
1849       auto ShiftAmt = B.buildConstant(S32, Shift);
1850       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1851     }
1852 
1853     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1854   } else
1855     B.buildCopy(DstReg, LiveIn);
1856 
1857   // Insert the argument copy if it doesn't already exist.
1858   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1859   if (!MRI.getVRegDef(LiveIn)) {
1860     // FIXME: Should have scoped insert pt
1861     MachineBasicBlock &OrigInsBB = B.getMBB();
1862     auto OrigInsPt = B.getInsertPt();
1863 
1864     MachineBasicBlock &EntryMBB = B.getMF().front();
1865     EntryMBB.addLiveIn(Arg->getRegister());
1866     B.setInsertPt(EntryMBB, EntryMBB.begin());
1867     B.buildCopy(LiveIn, Arg->getRegister());
1868 
1869     B.setInsertPt(OrigInsBB, OrigInsPt);
1870   }
1871 
1872   return true;
1873 }
1874 
1875 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1876   MachineInstr &MI,
1877   MachineRegisterInfo &MRI,
1878   MachineIRBuilder &B,
1879   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1880   B.setInstr(MI);
1881 
1882   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1883 
1884   const ArgDescriptor *Arg;
1885   const TargetRegisterClass *RC;
1886   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1887   if (!Arg) {
1888     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1889     return false;
1890   }
1891 
1892   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1893     MI.eraseFromParent();
1894     return true;
1895   }
1896 
1897   return false;
1898 }
1899 
1900 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1901                                        MachineRegisterInfo &MRI,
1902                                        MachineIRBuilder &B) const {
1903   B.setInstr(MI);
1904   Register Dst = MI.getOperand(0).getReg();
1905   LLT DstTy = MRI.getType(Dst);
1906   LLT S16 = LLT::scalar(16);
1907   LLT S32 = LLT::scalar(32);
1908   LLT S64 = LLT::scalar(64);
1909 
1910   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1911     return true;
1912 
1913   if (DstTy == S16)
1914     return legalizeFDIV16(MI, MRI, B);
1915   if (DstTy == S32)
1916     return legalizeFDIV32(MI, MRI, B);
1917   if (DstTy == S64)
1918     return legalizeFDIV64(MI, MRI, B);
1919 
1920   return false;
1921 }
1922 
1923 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1924                                                  MachineRegisterInfo &MRI,
1925                                                  MachineIRBuilder &B) const {
1926   Register Res = MI.getOperand(0).getReg();
1927   Register LHS = MI.getOperand(1).getReg();
1928   Register RHS = MI.getOperand(2).getReg();
1929 
1930   uint16_t Flags = MI.getFlags();
1931 
1932   LLT ResTy = MRI.getType(Res);
1933   LLT S32 = LLT::scalar(32);
1934   LLT S64 = LLT::scalar(64);
1935 
1936   const MachineFunction &MF = B.getMF();
1937   bool Unsafe =
1938     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1939 
1940   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1941     return false;
1942 
1943   if (!Unsafe && ResTy == S32 &&
1944       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1945     return false;
1946 
1947   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1948     // 1 / x -> RCP(x)
1949     if (CLHS->isExactlyValue(1.0)) {
1950       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1951         .addUse(RHS)
1952         .setMIFlags(Flags);
1953 
1954       MI.eraseFromParent();
1955       return true;
1956     }
1957 
1958     // -1 / x -> RCP( FNEG(x) )
1959     if (CLHS->isExactlyValue(-1.0)) {
1960       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1961       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1962         .addUse(FNeg.getReg(0))
1963         .setMIFlags(Flags);
1964 
1965       MI.eraseFromParent();
1966       return true;
1967     }
1968   }
1969 
1970   // x / y -> x * (1.0 / y)
1971   if (Unsafe) {
1972     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1973       .addUse(RHS)
1974       .setMIFlags(Flags);
1975     B.buildFMul(Res, LHS, RCP, Flags);
1976 
1977     MI.eraseFromParent();
1978     return true;
1979   }
1980 
1981   return false;
1982 }
1983 
1984 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1985                                          MachineRegisterInfo &MRI,
1986                                          MachineIRBuilder &B) const {
1987   B.setInstr(MI);
1988   Register Res = MI.getOperand(0).getReg();
1989   Register LHS = MI.getOperand(1).getReg();
1990   Register RHS = MI.getOperand(2).getReg();
1991 
1992   uint16_t Flags = MI.getFlags();
1993 
1994   LLT S16 = LLT::scalar(16);
1995   LLT S32 = LLT::scalar(32);
1996 
1997   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1998   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1999 
2000   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2001     .addUse(RHSExt.getReg(0))
2002     .setMIFlags(Flags);
2003 
2004   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2005   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2006 
2007   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2008     .addUse(RDst.getReg(0))
2009     .addUse(RHS)
2010     .addUse(LHS)
2011     .setMIFlags(Flags);
2012 
2013   MI.eraseFromParent();
2014   return true;
2015 }
2016 
2017 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2018 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2019 static void toggleSPDenormMode(bool Enable,
2020                                MachineIRBuilder &B,
2021                                const GCNSubtarget &ST,
2022                                AMDGPU::SIModeRegisterDefaults Mode) {
2023   // Set SP denorm mode to this value.
2024   unsigned SPDenormMode =
2025     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2026 
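  // S_DENORM_MODE takes a 4-bit immediate: bits [1:0] control FP32 denormals
  // and bits [3:2] control FP64/FP16, so rebuild the full field to preserve
  // the function's default FP64/FP16 setting. Older subtargets instead write
  // the 2-bit FP32 denorm field of the MODE register (offset 4, width 2)
  // with S_SETREG.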
2027   if (ST.hasDenormModeInst()) {
2028     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2029     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2030                                    ? FP_DENORM_FLUSH_NONE
2031                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2032 
2033     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2034     B.buildInstr(AMDGPU::S_DENORM_MODE)
2035       .addImm(NewDenormModeValue);
2036 
2037   } else {
2038     // Select FP32 bit field in mode register.
2039     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2040                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2041                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2042 
2043     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2044       .addImm(SPDenormMode)
2045       .addImm(SPDenormModeBitField);
2046   }
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2050                                          MachineRegisterInfo &MRI,
2051                                          MachineIRBuilder &B) const {
2052   B.setInstr(MI);
2053   Register Res = MI.getOperand(0).getReg();
2054   Register LHS = MI.getOperand(1).getReg();
2055   Register RHS = MI.getOperand(2).getReg();
2056   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2057   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2058 
2059   uint16_t Flags = MI.getFlags();
2060 
2061   LLT S32 = LLT::scalar(32);
2062   LLT S1 = LLT::scalar(1);
2063 
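  // Full-rate f32 division: div_scale pre-scales the operands into a safe
  // range, rcp gives an initial reciprocal estimate, the FMA chain performs
  // Newton-Raphson refinement of the reciprocal and the quotient, div_fmas
  // applies the scale decision, and div_fixup patches up signs and special
  // cases (0, inf, nan). The intermediate steps can produce denormal values,
  // so FP32 denormals are temporarily enabled if the function runs with them
  // flushed.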
2064   auto One = B.buildFConstant(S32, 1.0f);
2065 
2066   auto DenominatorScaled =
2067     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2068       .addUse(RHS)
2069       .addUse(LHS)
2070       .addImm(1)
2071       .setMIFlags(Flags);
2072   auto NumeratorScaled =
2073     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2074       .addUse(LHS)
2075       .addUse(RHS)
2076       .addImm(0)
2077       .setMIFlags(Flags);
2078 
2079   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2080     .addUse(DenominatorScaled.getReg(0))
2081     .setMIFlags(Flags);
2082   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2083 
2084   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2085   // aren't modeled as reading it.
2086   if (!Mode.FP32Denormals)
2087     toggleSPDenormMode(true, B, ST, Mode);
2088 
2089   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2090   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2091   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2092   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2093   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2094   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2095 
2096   if (!Mode.FP32Denormals)
2097     toggleSPDenormMode(false, B, ST, Mode);
2098 
2099   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2100     .addUse(Fma4.getReg(0))
2101     .addUse(Fma1.getReg(0))
2102     .addUse(Fma3.getReg(0))
2103     .addUse(NumeratorScaled.getReg(1))
2104     .setMIFlags(Flags);
2105 
2106   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2107     .addUse(Fmas.getReg(0))
2108     .addUse(RHS)
2109     .addUse(LHS)
2110     .setMIFlags(Flags);
2111 
2112   MI.eraseFromParent();
2113   return true;
2114 }
2115 
2116 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2117                                          MachineRegisterInfo &MRI,
2118                                          MachineIRBuilder &B) const {
2119   B.setInstr(MI);
2120   Register Res = MI.getOperand(0).getReg();
2121   Register LHS = MI.getOperand(1).getReg();
2122   Register RHS = MI.getOperand(2).getReg();
2123 
2124   uint16_t Flags = MI.getFlags();
2125 
2126   LLT S64 = LLT::scalar(64);
2127   LLT S1 = LLT::scalar(1);
2128 
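  // f64 division follows the same div_scale / rcp / Newton-Raphson refinement
  // / div_fmas / div_fixup recipe as the f32 path, plus a workaround below
  // for subtargets where div_scale's compare result is unusable.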
2129   auto One = B.buildFConstant(S64, 1.0);
2130 
2131   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2132     .addUse(LHS)
2133     .addUse(RHS)
2134     .addImm(1)
2135     .setMIFlags(Flags);
2136 
2137   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2138 
2139   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2140     .addUse(DivScale0.getReg(0))
2141     .setMIFlags(Flags);
2142 
2143   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2144   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2145   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2146 
2147   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2148     .addUse(LHS)
2149     .addUse(RHS)
2150     .addImm(0)
2151     .setMIFlags(Flags);
2152 
2153   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2154   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2155   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2156 
2157   Register Scale;
2158   if (!ST.hasUsableDivScaleConditionOutput()) {
2159     // Workaround a hardware bug on SI where the condition output from div_scale
2160     // is not usable.
2161 
2162     Scale = MRI.createGenericVirtualRegister(S1);
2163 
2164     LLT S32 = LLT::scalar(32);
2165 
2166     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2167     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2168     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2169     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2170 
2171     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2172                               Scale1Unmerge.getReg(1));
2173     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2174                               Scale0Unmerge.getReg(1));
2175     B.buildXor(Scale, CmpNum, CmpDen);
2176   } else {
2177     Scale = DivScale1.getReg(1);
2178   }
2179 
2180   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2181     .addUse(Fma4.getReg(0))
2182     .addUse(Fma3.getReg(0))
2183     .addUse(Mul.getReg(0))
2184     .addUse(Scale)
2185     .setMIFlags(Flags);
2186 
2187   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2188     .addUse(Fmas.getReg(0))
2189     .addUse(RHS)
2190     .addUse(LHS)
2191     .setMIFlags(Flags);
2192 
2193   MI.eraseFromParent();
2194   return true;
2195 }
2196 
2197 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2198                                                  MachineRegisterInfo &MRI,
2199                                                  MachineIRBuilder &B) const {
2200   B.setInstr(MI);
2201   Register Res = MI.getOperand(0).getReg();
2202   Register LHS = MI.getOperand(2).getReg();
2203   Register RHS = MI.getOperand(3).getReg();
2204   uint16_t Flags = MI.getFlags();
2205 
2206   LLT S32 = LLT::scalar(32);
2207   LLT S1 = LLT::scalar(1);
2208 
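  // Fast lowering for amdgcn.fdiv.fast: if |RHS| exceeds 2^96 (0x6f800000),
  // pre-scale the denominator by 2^-32 (0x2f800000) so the rcp does not
  // underflow; otherwise scale by 1.0. Multiplying the quotient by the same
  // scale factor afterwards cancels it: (LHS * 1/(RHS*S)) * S == LHS/RHS.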
2209   auto Abs = B.buildFAbs(S32, RHS, Flags);
2210   const APFloat C0Val(1.0f);
2211 
2212   auto C0 = B.buildConstant(S32, 0x6f800000);
2213   auto C1 = B.buildConstant(S32, 0x2f800000);
2214   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2215 
2216   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2217   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2218 
2219   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2220 
2221   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2222     .addUse(Mul0.getReg(0))
2223     .setMIFlags(Flags);
2224 
2225   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2226 
2227   B.buildFMul(Res, Sel, Mul1, Flags);
2228 
2229   MI.eraseFromParent();
2230   return true;
2231 }
2232 
2233 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2234                                                  MachineRegisterInfo &MRI,
2235                                                  MachineIRBuilder &B) const {
2236   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2237   if (!MFI->isEntryFunction()) {
2238     return legalizePreloadedArgIntrin(MI, MRI, B,
2239                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2240   }
2241 
2242   B.setInstr(MI);
2243 
2244   uint64_t Offset =
2245     ST.getTargetLowering()->getImplicitParameterOffset(
2246       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2247   Register DstReg = MI.getOperand(0).getReg();
2248   LLT DstTy = MRI.getType(DstReg);
2249   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2250 
2251   const ArgDescriptor *Arg;
2252   const TargetRegisterClass *RC;
2253   std::tie(Arg, RC)
2254     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2255   if (!Arg)
2256     return false;
2257 
2258   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2259   if (!loadInputValue(KernargPtrReg, B, Arg))
2260     return false;
2261 
2262   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2263   MI.eraseFromParent();
2264   return true;
2265 }
2266 
2267 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2268                                               MachineRegisterInfo &MRI,
2269                                               MachineIRBuilder &B,
2270                                               unsigned AddrSpace) const {
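  // A flat pointer addresses LOCAL or PRIVATE memory exactly when its high
  // 32 bits equal that segment's aperture base, so compare the high half of
  // the pointer against the aperture register.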
2271   B.setInstr(MI);
2272   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2273   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2274   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2275   MI.eraseFromParent();
2276   return true;
2277 }
2278 
2279 /// Handle register layout difference for f16 images for some subtargets.
2280 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2281                                              MachineRegisterInfo &MRI,
2282                                              Register Reg) const {
2283   if (!ST.hasUnpackedD16VMem())
2284     return Reg;
2285 
2286   const LLT S16 = LLT::scalar(16);
2287   const LLT S32 = LLT::scalar(32);
2288   LLT StoreVT = MRI.getType(Reg);
2289   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2290 
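  // On unpacked-D16 subtargets each 16-bit element of the data operand takes
  // a full 32-bit register, so split the vector into s16 pieces, any-extend
  // each piece to s32, and rebuild the value as an <N x s32> vector.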
2291   auto Unmerge = B.buildUnmerge(S16, Reg);
2292 
2293   SmallVector<Register, 4> WideRegs;
2294   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2295     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2296 
2297   int NumElts = StoreVT.getNumElements();
2298 
2299   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2300 }
2301 
2302 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2303                                                  MachineRegisterInfo &MRI,
2304                                                  MachineIRBuilder &B,
2305                                                  bool IsFormat) const {
2306   // TODO: Reject f16 format on targets where unsupported.
2307   Register VData = MI.getOperand(1).getReg();
2308   LLT Ty = MRI.getType(VData);
2309 
2310   B.setInstr(MI);
2311 
2312   const LLT S32 = LLT::scalar(32);
2313   const LLT S16 = LLT::scalar(16);
2314 
2315   // Fixup illegal register types for i8 stores.
2316   if (Ty == LLT::scalar(8) || Ty == S16) {
2317     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2318     MI.getOperand(1).setReg(AnyExt);
2319     return true;
2320   }
2321 
2322   if (Ty.isVector()) {
2323     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2324       if (IsFormat)
2325         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2326       return true;
2327     }
2328 
2329     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2330   }
2331 
2332   return Ty == S32;
2333 }
2334 
2335 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2336                                             MachineRegisterInfo &MRI,
2337                                             MachineIRBuilder &B) const {
2338   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2339   auto IntrID = MI.getIntrinsicID();
2340   switch (IntrID) {
2341   case Intrinsic::amdgcn_if:
2342   case Intrinsic::amdgcn_else: {
2343     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2344       const SIRegisterInfo *TRI
2345         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2346 
2347       B.setInstr(*BrCond);
2348       Register Def = MI.getOperand(1).getReg();
2349       Register Use = MI.getOperand(3).getReg();
2350 
2351       if (IntrID == Intrinsic::amdgcn_if) {
2352         B.buildInstr(AMDGPU::SI_IF)
2353           .addDef(Def)
2354           .addUse(Use)
2355           .addMBB(BrCond->getOperand(1).getMBB());
2356       } else {
2357         B.buildInstr(AMDGPU::SI_ELSE)
2358           .addDef(Def)
2359           .addUse(Use)
2360           .addMBB(BrCond->getOperand(1).getMBB())
2361           .addImm(0);
2362       }
2363 
2364       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2365       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2366       MI.eraseFromParent();
2367       BrCond->eraseFromParent();
2368       return true;
2369     }
2370 
2371     return false;
2372   }
2373   case Intrinsic::amdgcn_loop: {
2374     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2375       const SIRegisterInfo *TRI
2376         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2377 
2378       B.setInstr(*BrCond);
2379       Register Reg = MI.getOperand(2).getReg();
2380       B.buildInstr(AMDGPU::SI_LOOP)
2381         .addUse(Reg)
2382         .addMBB(BrCond->getOperand(1).getMBB());
2383       MI.eraseFromParent();
2384       BrCond->eraseFromParent();
2385       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2386       return true;
2387     }
2388 
2389     return false;
2390   }
2391   case Intrinsic::amdgcn_kernarg_segment_ptr:
2392     return legalizePreloadedArgIntrin(
2393       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2394   case Intrinsic::amdgcn_implicitarg_ptr:
2395     return legalizeImplicitArgPtr(MI, MRI, B);
2396   case Intrinsic::amdgcn_workitem_id_x:
2397     return legalizePreloadedArgIntrin(MI, MRI, B,
2398                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2399   case Intrinsic::amdgcn_workitem_id_y:
2400     return legalizePreloadedArgIntrin(MI, MRI, B,
2401                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2402   case Intrinsic::amdgcn_workitem_id_z:
2403     return legalizePreloadedArgIntrin(MI, MRI, B,
2404                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2405   case Intrinsic::amdgcn_workgroup_id_x:
2406     return legalizePreloadedArgIntrin(MI, MRI, B,
2407                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2408   case Intrinsic::amdgcn_workgroup_id_y:
2409     return legalizePreloadedArgIntrin(MI, MRI, B,
2410                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2411   case Intrinsic::amdgcn_workgroup_id_z:
2412     return legalizePreloadedArgIntrin(MI, MRI, B,
2413                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2414   case Intrinsic::amdgcn_dispatch_ptr:
2415     return legalizePreloadedArgIntrin(MI, MRI, B,
2416                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2417   case Intrinsic::amdgcn_queue_ptr:
2418     return legalizePreloadedArgIntrin(MI, MRI, B,
2419                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2420   case Intrinsic::amdgcn_implicit_buffer_ptr:
2421     return legalizePreloadedArgIntrin(
2422       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2423   case Intrinsic::amdgcn_dispatch_id:
2424     return legalizePreloadedArgIntrin(MI, MRI, B,
2425                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2426   case Intrinsic::amdgcn_fdiv_fast:
2427     return legalizeFDIVFastIntrin(MI, MRI, B);
2428   case Intrinsic::amdgcn_is_shared:
2429     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2430   case Intrinsic::amdgcn_is_private:
2431     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2432   case Intrinsic::amdgcn_wavefrontsize: {
2433     B.setInstr(MI);
2434     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2435     MI.eraseFromParent();
2436     return true;
2437   }
2438   case Intrinsic::amdgcn_raw_buffer_store:
2439     return legalizeRawBufferStore(MI, MRI, B, false);
2440   case Intrinsic::amdgcn_raw_buffer_store_format:
2441     return legalizeRawBufferStore(MI, MRI, B, true);
2442   default:
2443     return true;
2444   }
2445 
2446   return true;
2447 }
2448