1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
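// Accept any type whose scalar/element size is a multiple of 32 bits, up to a
// total size of MaxSize bits.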
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
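// Match vectors with an odd number of sub-32-bit elements whose total size is
// not a multiple of 32 bits, e.g. v3s16 or v5s8.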
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
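// Match vectors of 16-bit elements with more than two elements, e.g. v4s16 but
// not v2s16.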
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
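// Pad the vector with one extra element, e.g. v3s16 becomes v4s16.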
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
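// Break the vector into pieces of at most 64 bits each, e.g. v4s32 is split
// into v2s32 halves.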
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
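// For example, v3s16 (48 bits) is widened to v4s16 (64 bits).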
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
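// For example, v8s32, v2s16 and v4s16 qualify, but v3s16 does not.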
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32;
248 
249   setAction({G_BRCOND, S1}, Legal); // VCC branches
250   setAction({G_BRCOND, S32}, Legal); // SCC branches
251 
252   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
253   // elements for v3s16
254   getActionDefinitionsBuilder(G_PHI)
255     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
256     .legalFor(AllS32Vectors)
257     .legalFor(AllS64Vectors)
258     .legalFor(AddrSpaces64)
259     .legalFor(AddrSpaces32)
260     .clampScalar(0, S32, S256)
261     .widenScalarToNextPow2(0, 32)
262     .clampMaxNumElements(0, S32, 16)
263     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
264     .legalIf(isPointer(0));
265 
266   if (ST.has16BitInsts()) {
267     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
268       .legalFor({S32, S16})
269       .clampScalar(0, S16, S32)
270       .scalarize(0);
271   } else {
272     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
273       .legalFor({S32})
274       .clampScalar(0, S32, S32)
275       .scalarize(0);
276   }
277 
278   // FIXME: Not really legal. Placeholder for custom lowering.
279   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
280     .legalFor({S32, S64})
281     .clampScalar(0, S32, S64)
282     .widenScalarToNextPow2(0, 32)
283     .scalarize(0);
284 
285   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
286     .legalFor({S32})
287     .clampScalar(0, S32, S32)
288     .scalarize(0);
289 
290   // Report legal for any types we can handle anywhere. For the cases only legal
291   // on the SALU, RegBankSelect will be able to re-legalize.
292   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
293     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
294     .clampScalar(0, S32, S64)
295     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
296     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
297     .widenScalarToNextPow2(0)
298     .scalarize(0);
299 
300   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
301                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
302     .legalFor({{S32, S1}, {S32, S32}})
303     .clampScalar(0, S32, S32)
304     .scalarize(0); // TODO: Implement.
305 
306   getActionDefinitionsBuilder(G_BITCAST)
307     // Don't worry about the size constraint.
308     .legalIf(all(isRegisterType(0), isRegisterType(1)))
309     // FIXME: Testing hack
310     .legalForCartesianProduct({S16, LLT::vector(2, 8), })
311     .lower();
312 
313 
314   getActionDefinitionsBuilder(G_CONSTANT)
315     .legalFor({S1, S32, S64, S16, GlobalPtr,
316                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
317     .clampScalar(0, S32, S64)
318     .widenScalarToNextPow2(0)
319     .legalIf(isPointer(0));
320 
321   getActionDefinitionsBuilder(G_FCONSTANT)
322     .legalFor({S32, S64, S16})
323     .clampScalar(0, S16, S64);
324 
325   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
326     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
327                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
328     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
329     .clampScalarOrElt(0, S32, S1024)
330     .legalIf(isMultiple32(0))
331     .widenScalarToNextPow2(0, 32)
332     .clampMaxNumElements(0, S32, 16);
333 
334   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
335   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
336     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
337   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
338 
339   auto &FPOpActions = getActionDefinitionsBuilder(
340     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
341     .legalFor({S32, S64});
342   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
343     .customFor({S32, S64});
344   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
345     .customFor({S32, S64});
346 
347   if (ST.has16BitInsts()) {
348     if (ST.hasVOP3PInsts())
349       FPOpActions.legalFor({S16, V2S16});
350     else
351       FPOpActions.legalFor({S16});
352 
353     TrigActions.customFor({S16});
354     FDIVActions.customFor({S16});
355   }
356 
357   auto &MinNumMaxNum = getActionDefinitionsBuilder({
358       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
359 
360   if (ST.hasVOP3PInsts()) {
361     MinNumMaxNum.customFor(FPTypesPK16)
362       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
363       .clampMaxNumElements(0, S16, 2)
364       .clampScalar(0, S16, S64)
365       .scalarize(0);
366   } else if (ST.has16BitInsts()) {
367     MinNumMaxNum.customFor(FPTypes16)
368       .clampScalar(0, S16, S64)
369       .scalarize(0);
370   } else {
371     MinNumMaxNum.customFor(FPTypesBase)
372       .clampScalar(0, S32, S64)
373       .scalarize(0);
374   }
375 
376   if (ST.hasVOP3PInsts())
377     FPOpActions.clampMaxNumElements(0, S16, 2);
378 
379   FPOpActions
380     .scalarize(0)
381     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
382 
383   TrigActions
384     .scalarize(0)
385     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
386 
387   FDIVActions
388     .scalarize(0)
389     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
390 
391   getActionDefinitionsBuilder({G_FNEG, G_FABS})
392     .legalFor(FPTypesPK16)
393     .clampMaxNumElements(0, S16, 2)
394     .scalarize(0)
395     .clampScalar(0, S16, S64);
396 
397   if (ST.has16BitInsts()) {
398     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
399       .legalFor({S32, S64, S16})
400       .scalarize(0)
401       .clampScalar(0, S16, S64);
402   } else {
403     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
404       .legalFor({S32, S64})
405       .scalarize(0)
406       .clampScalar(0, S32, S64);
407   }
408 
409   getActionDefinitionsBuilder(G_FPTRUNC)
410     .legalFor({{S32, S64}, {S16, S32}})
411     .scalarize(0);
412 
413   getActionDefinitionsBuilder(G_FPEXT)
414     .legalFor({{S64, S32}, {S32, S16}})
415     .lowerFor({{S64, S16}}) // FIXME: Implement
416     .scalarize(0);
417 
418   getActionDefinitionsBuilder(G_FSUB)
419       // Use actual fsub instruction
420       .legalFor({S32})
421       // Must use fadd + fneg
422       .lowerFor({S64, S16, V2S16})
423       .scalarize(0)
424       .clampScalar(0, S32, S64);
425 
426   // Whether this is legal depends on the floating point mode for the function.
427   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
428   if (ST.hasMadF16())
429     FMad.customFor({S32, S16});
430   else
431     FMad.customFor({S32});
432   FMad.scalarize(0)
433       .lower();
434 
435   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
436     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
437                {S32, S1}, {S64, S1}, {S16, S1},
438                {S96, S32},
439                // FIXME: Hack
440                {S64, LLT::scalar(33)},
441                {S32, S8}, {S32, LLT::scalar(24)}})
442     .scalarize(0)
443     .clampScalar(0, S32, S64);
444 
445   // TODO: Split s1->s64 during regbankselect for VALU.
446   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
447     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
448     .lowerFor({{S32, S64}})
449     .lowerIf(typeIs(1, S1))
450     .customFor({{S64, S64}});
451   if (ST.has16BitInsts())
452     IToFP.legalFor({{S16, S16}});
453   IToFP.clampScalar(1, S32, S64)
454        .scalarize(0);
455 
456   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
457     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
458   if (ST.has16BitInsts())
459     FPToI.legalFor({{S16, S16}});
460   else
461     FPToI.minScalar(1, S32);
462 
463   FPToI.minScalar(0, S32)
464        .scalarize(0);
465 
466   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
467     .scalarize(0)
468     .lower();
469 
470   if (ST.has16BitInsts()) {
471     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
472       .legalFor({S16, S32, S64})
473       .clampScalar(0, S16, S64)
474       .scalarize(0);
475   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
476     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
477       .legalFor({S32, S64})
478       .clampScalar(0, S32, S64)
479       .scalarize(0);
480   } else {
481     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
482       .legalFor({S32})
483       .customFor({S64})
484       .clampScalar(0, S32, S64)
485       .scalarize(0);
486   }
487 
488   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
489     .scalarize(0)
490     .alwaysLegal();
491 
492   auto &CmpBuilder =
493     getActionDefinitionsBuilder(G_ICMP)
494     // The compare output type differs based on the register bank of the output,
495     // so make both s1 and s32 legal.
496     //
497     // Scalar compares producing output in scc will be promoted to s32, as that
498     // is the allocatable register type that will be needed for the copy from
499     // scc. This will be promoted during RegBankSelect, and we assume something
500     // before that won't try to use s32 result types.
501     //
502     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
503     // bank.
504     .legalForCartesianProduct(
505       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
506     .legalForCartesianProduct(
507       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
508   if (ST.has16BitInsts()) {
509     CmpBuilder.legalFor({{S1, S16}});
510   }
511 
512   CmpBuilder
513     .widenScalarToNextPow2(1)
514     .clampScalar(1, S32, S64)
515     .scalarize(0)
516     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
517 
518   getActionDefinitionsBuilder(G_FCMP)
519     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
520     .widenScalarToNextPow2(1)
521     .clampScalar(1, S32, S64)
522     .scalarize(0);
523 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
525   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
526                                G_FLOG, G_FLOG2, G_FLOG10})
527     .legalFor({S32})
528     .scalarize(0);
529 
530   // The 64-bit versions produce 32-bit results, but only on the SALU.
531   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
532                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
533                                G_CTPOP})
534     .legalFor({{S32, S32}, {S32, S64}})
535     .clampScalar(0, S32, S32)
536     .clampScalar(1, S32, S64)
537     .scalarize(0)
538     .widenScalarToNextPow2(0, 32)
539     .widenScalarToNextPow2(1, 32);
540 
541   // TODO: Expand for > s32
542   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
543     .legalFor({S32})
544     .clampScalar(0, S32, S32)
545     .scalarize(0);
546 
547   if (ST.has16BitInsts()) {
548     if (ST.hasVOP3PInsts()) {
549       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
550         .legalFor({S32, S16, V2S16})
551         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
552         .clampMaxNumElements(0, S16, 2)
553         .clampScalar(0, S16, S32)
554         .widenScalarToNextPow2(0)
555         .scalarize(0);
556     } else {
557       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
558         .legalFor({S32, S16})
559         .widenScalarToNextPow2(0)
560         .clampScalar(0, S16, S32)
561         .scalarize(0);
562     }
563   } else {
564     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
565       .legalFor({S32})
566       .clampScalar(0, S32, S32)
567       .widenScalarToNextPow2(0)
568       .scalarize(0);
569   }
570 
571   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
572     return [=](const LegalityQuery &Query) {
573       return Query.Types[TypeIdx0].getSizeInBits() <
574              Query.Types[TypeIdx1].getSizeInBits();
575     };
576   };
577 
578   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
579     return [=](const LegalityQuery &Query) {
580       return Query.Types[TypeIdx0].getSizeInBits() >
581              Query.Types[TypeIdx1].getSizeInBits();
582     };
583   };
584 
585   getActionDefinitionsBuilder(G_INTTOPTR)
586     // List the common cases
587     .legalForCartesianProduct(AddrSpaces64, {S64})
588     .legalForCartesianProduct(AddrSpaces32, {S32})
589     .scalarize(0)
590     // Accept any address space as long as the size matches
591     .legalIf(sameSize(0, 1))
592     .widenScalarIf(smallerThan(1, 0),
593       [](const LegalityQuery &Query) {
594         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
595       })
596     .narrowScalarIf(greaterThan(1, 0),
597       [](const LegalityQuery &Query) {
598         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
599       });
600 
601   getActionDefinitionsBuilder(G_PTRTOINT)
602     // List the common cases
603     .legalForCartesianProduct(AddrSpaces64, {S64})
604     .legalForCartesianProduct(AddrSpaces32, {S32})
605     .scalarize(0)
606     // Accept any address space as long as the size matches
607     .legalIf(sameSize(0, 1))
608     .widenScalarIf(smallerThan(0, 1),
609       [](const LegalityQuery &Query) {
610         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
611       })
612     .narrowScalarIf(
613       greaterThan(0, 1),
614       [](const LegalityQuery &Query) {
615         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
616       });
617 
618   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
619     .scalarize(0)
620     .custom();
621 
622   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
623   // handle some operations by just promoting the register during
624   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
625   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
626     switch (AS) {
627     // FIXME: Private element size.
628     case AMDGPUAS::PRIVATE_ADDRESS:
629       return 32;
630     // FIXME: Check subtarget
631     case AMDGPUAS::LOCAL_ADDRESS:
632       return ST.useDS128() ? 128 : 64;
633 
634     // Treat constant and global as identical. SMRD loads are sometimes usable
635     // for global loads (ideally constant address space should be eliminated)
636     // depending on the context. Legality cannot be context dependent, but
637     // RegBankSelect can split the load as necessary depending on the pointer
638     // register bank/uniformity and if the memory is invariant or not written in
639     // a kernel.
640     case AMDGPUAS::CONSTANT_ADDRESS:
641     case AMDGPUAS::GLOBAL_ADDRESS:
642       return 512;
643     default:
644       return 128;
645     }
646   };
647 
648   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
649     const LLT DstTy = Query.Types[0];
650 
651     // Split vector extloads.
652     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
653     unsigned Align = Query.MMODescrs[0].AlignInBits;
654 
655     if (MemSize < DstTy.getSizeInBits())
656       MemSize = std::max(MemSize, Align);
657 
658     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
659       return true;
660 
661     const LLT PtrTy = Query.Types[1];
662     unsigned AS = PtrTy.getAddressSpace();
663     if (MemSize > maxSizeForAddrSpace(AS))
664       return true;
665 
666     // Catch weird sized loads that don't evenly divide into the access sizes
667     // TODO: May be able to widen depending on alignment etc.
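    // For example, a 96-bit access needs 3 dwords and must be split on
    // subtargets without dwordx3 load/stores.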
668     unsigned NumRegs = MemSize / 32;
669     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
670       return true;
671 
672     if (Align < MemSize) {
673       const SITargetLowering *TLI = ST.getTargetLowering();
674       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
675     }
676 
677     return false;
678   };
679 
680   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
681   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
682   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
683 
684   // TODO: Refine based on subtargets which support unaligned access or 128-bit
685   // LDS
686   // TODO: Unsupported flat for SI.
687 
688   for (unsigned Op : {G_LOAD, G_STORE}) {
689     const bool IsStore = Op == G_STORE;
690 
691     auto &Actions = getActionDefinitionsBuilder(Op);
692     // Whitelist the common cases.
693     // TODO: Pointer loads
694     // TODO: Wide constant loads
695     // TODO: Only CI+ has 3x loads
696     // TODO: Loads to s16 on gfx9
697     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
698                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
699                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
700                                       {S96, GlobalPtr, 96, GlobalAlign32},
701                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
702                                       {S128, GlobalPtr, 128, GlobalAlign32},
703                                       {S64, GlobalPtr, 64, GlobalAlign32},
704                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
705                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
706                                       {S32, GlobalPtr, 8, GlobalAlign8},
707                                       {S32, GlobalPtr, 16, GlobalAlign16},
708 
709                                       {S32, LocalPtr, 32, 32},
710                                       {S64, LocalPtr, 64, 32},
711                                       {V2S32, LocalPtr, 64, 32},
712                                       {S32, LocalPtr, 8, 8},
713                                       {S32, LocalPtr, 16, 16},
714                                       {V2S16, LocalPtr, 32, 32},
715 
716                                       {S32, PrivatePtr, 32, 32},
717                                       {S32, PrivatePtr, 8, 8},
718                                       {S32, PrivatePtr, 16, 16},
719                                       {V2S16, PrivatePtr, 32, 32},
720 
721                                       {S32, FlatPtr, 32, GlobalAlign32},
722                                       {S32, FlatPtr, 16, GlobalAlign16},
723                                       {S32, FlatPtr, 8, GlobalAlign8},
724                                       {V2S16, FlatPtr, 32, GlobalAlign32},
725 
726                                       {S32, ConstantPtr, 32, GlobalAlign32},
727                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
728                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
729                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
730                                       {S64, ConstantPtr, 64, GlobalAlign32},
731                                       {S128, ConstantPtr, 128, GlobalAlign32},
732                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
733     Actions
734         .customIf(typeIs(1, Constant32Ptr))
735         .narrowScalarIf(
736             [=](const LegalityQuery &Query) -> bool {
737               return !Query.Types[0].isVector() && needToSplitLoad(Query);
738             },
739             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
740               const LLT DstTy = Query.Types[0];
741               const LLT PtrTy = Query.Types[1];
742 
743               const unsigned DstSize = DstTy.getSizeInBits();
744               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
745 
746               // Split extloads.
747               if (DstSize > MemSize)
748                 return std::make_pair(0, LLT::scalar(MemSize));
749 
750               if (DstSize > 32 && (DstSize % 32 != 0)) {
751                 // FIXME: Need a way to specify non-extload of larger size if
752                 // suitably aligned.
753                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
754               }
755 
756               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
757               if (MemSize > MaxSize)
758                 return std::make_pair(0, LLT::scalar(MaxSize));
759 
760               unsigned Align = Query.MMODescrs[0].AlignInBits;
761               return std::make_pair(0, LLT::scalar(Align));
762             })
763         .fewerElementsIf(
764             [=](const LegalityQuery &Query) -> bool {
765               return Query.Types[0].isVector() && needToSplitLoad(Query);
766             },
767             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
768               const LLT DstTy = Query.Types[0];
769               const LLT PtrTy = Query.Types[1];
770 
771               LLT EltTy = DstTy.getElementType();
772               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
773 
774               // Split if it's too large for the address space.
775               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
776                 unsigned NumElts = DstTy.getNumElements();
777                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
778 
779                 // FIXME: Refine when odd breakdowns handled
780                 // The scalars will need to be re-legalized.
781                 if (NumPieces == 1 || NumPieces >= NumElts ||
782                     NumElts % NumPieces != 0)
783                   return std::make_pair(0, EltTy);
784 
785                 return std::make_pair(0,
786                                       LLT::vector(NumElts / NumPieces, EltTy));
787               }
788 
789               // Need to split because of alignment.
790               unsigned Align = Query.MMODescrs[0].AlignInBits;
791               unsigned EltSize = EltTy.getSizeInBits();
792               if (EltSize > Align &&
793                   (EltSize / Align < DstTy.getNumElements())) {
794                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
795               }
796 
797               // May need relegalization for the scalars.
798               return std::make_pair(0, EltTy);
799             })
800         .minScalar(0, S32);
801 
802     if (IsStore)
803       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
804 
805     // TODO: Need a bitcast lower option?
806     Actions
807         .legalIf([=](const LegalityQuery &Query) {
808           const LLT Ty0 = Query.Types[0];
809           unsigned Size = Ty0.getSizeInBits();
810           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
811           unsigned Align = Query.MMODescrs[0].AlignInBits;
812 
813           // FIXME: Widening store from alignment not valid.
814           if (MemSize < Size)
815             MemSize = std::max(MemSize, Align);
816 
817           // No extending vector loads.
818           if (Size > MemSize && Ty0.isVector())
819             return false;
820 
821           switch (MemSize) {
822           case 8:
823           case 16:
824             return Size == 32;
825           case 32:
826           case 64:
827           case 128:
828             return true;
829           case 96:
830             return ST.hasDwordx3LoadStores();
831           case 256:
832           case 512:
833             return true;
834           default:
835             return false;
836           }
837         })
838         .widenScalarToNextPow2(0)
839         // TODO: v3s32->v4s32 with alignment
840         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
841   }
842 
843   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
844                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
845                                                   {S32, GlobalPtr, 16, 2 * 8},
846                                                   {S32, LocalPtr, 8, 8},
847                                                   {S32, LocalPtr, 16, 16},
848                                                   {S32, PrivatePtr, 8, 8},
849                                                   {S32, PrivatePtr, 16, 16},
850                                                   {S32, ConstantPtr, 8, 8},
851                                                   {S32, ConstantPtr, 16, 2 * 8}});
852   if (ST.hasFlatAddressSpace()) {
853     ExtLoads.legalForTypesWithMemDesc(
854         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
855   }
856 
857   ExtLoads.clampScalar(0, S32, S32)
858           .widenScalarToNextPow2(0)
859           .unsupportedIfMemSizeNotPow2()
860           .lower();
861 
862   auto &Atomics = getActionDefinitionsBuilder(
863     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
864      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
865      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
866      G_ATOMICRMW_UMIN})
867     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
868                {S64, GlobalPtr}, {S64, LocalPtr}});
869   if (ST.hasFlatAddressSpace()) {
870     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
871   }
872 
873   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
874     .legalFor({{S32, LocalPtr}});
875 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
878   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
879     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
880                 {S32, FlatPtr}, {S64, FlatPtr}})
881     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
882                {S32, RegionPtr}, {S64, RegionPtr}});
883   // TODO: Pointer types, any 32-bit or 64-bit vector
884 
885   // Condition should be s32 for scalar, s1 for vector.
886   getActionDefinitionsBuilder(G_SELECT)
887     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
888           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
889           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
890     .clampScalar(0, S16, S64)
891     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
892     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
893     .scalarize(1)
894     .clampMaxNumElements(0, S32, 2)
895     .clampMaxNumElements(0, LocalPtr, 2)
896     .clampMaxNumElements(0, PrivatePtr, 2)
897     .scalarize(0)
898     .widenScalarToNextPow2(0)
899     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
900 
901   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
902   // be more flexible with the shift amount type.
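  // (The hardware observes 4 bits for 16-bit shifts, 5 for 32-bit and 6 for
  // 64-bit shifts.)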
903   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
904     .legalFor({{S32, S32}, {S64, S32}});
905   if (ST.has16BitInsts()) {
906     if (ST.hasVOP3PInsts()) {
907       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
908             .clampMaxNumElements(0, S16, 2);
909     } else
910       Shifts.legalFor({{S16, S32}, {S16, S16}});
911 
912     // TODO: Support 16-bit shift amounts
913     Shifts.clampScalar(1, S32, S32);
914     Shifts.clampScalar(0, S16, S64);
915     Shifts.widenScalarToNextPow2(0, 16);
916   } else {
917     // Make sure we legalize the shift amount type first, as the general
918     // expansion for the shifted type will produce much worse code if it hasn't
919     // been truncated already.
920     Shifts.clampScalar(1, S32, S32);
921     Shifts.clampScalar(0, S32, S64);
922     Shifts.widenScalarToNextPow2(0, 32);
923   }
924   Shifts.scalarize(0);
925 
926   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
927     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
928     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
929     unsigned IdxTypeIdx = 2;
930 
931     getActionDefinitionsBuilder(Op)
932       .customIf([=](const LegalityQuery &Query) {
933           const LLT EltTy = Query.Types[EltTypeIdx];
934           const LLT VecTy = Query.Types[VecTypeIdx];
935           const LLT IdxTy = Query.Types[IdxTypeIdx];
936           return (EltTy.getSizeInBits() == 16 ||
937                   EltTy.getSizeInBits() % 32 == 0) &&
938                  VecTy.getSizeInBits() % 32 == 0 &&
939                  VecTy.getSizeInBits() <= 1024 &&
940                  IdxTy.getSizeInBits() == 32;
941         })
942       .clampScalar(EltTypeIdx, S32, S64)
943       .clampScalar(VecTypeIdx, S32, S64)
944       .clampScalar(IdxTypeIdx, S32, S32);
945   }
946 
947   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
948     .unsupportedIf([=](const LegalityQuery &Query) {
949         const LLT &EltTy = Query.Types[1].getElementType();
950         return Query.Types[0] != EltTy;
951       });
952 
953   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
954     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
955     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
956 
957     // FIXME: Doesn't handle extract of illegal sizes.
958     getActionDefinitionsBuilder(Op)
959       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
960       // FIXME: Multiples of 16 should not be legal.
961       .legalIf([=](const LegalityQuery &Query) {
962           const LLT BigTy = Query.Types[BigTyIdx];
963           const LLT LitTy = Query.Types[LitTyIdx];
964           return (BigTy.getSizeInBits() % 32 == 0) &&
965                  (LitTy.getSizeInBits() % 16 == 0);
966         })
967       .widenScalarIf(
968         [=](const LegalityQuery &Query) {
969           const LLT BigTy = Query.Types[BigTyIdx];
970           return (BigTy.getScalarSizeInBits() < 16);
971         },
972         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
973       .widenScalarIf(
974         [=](const LegalityQuery &Query) {
975           const LLT LitTy = Query.Types[LitTyIdx];
976           return (LitTy.getScalarSizeInBits() < 16);
977         },
978         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
979       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
980       .widenScalarToNextPow2(BigTyIdx, 32);
981 
982   }
983 
984   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
985     .legalForCartesianProduct(AllS32Vectors, {S32})
986     .legalForCartesianProduct(AllS64Vectors, {S64})
987     .clampNumElements(0, V16S32, V32S32)
988     .clampNumElements(0, V2S64, V16S64)
989     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
990 
991   if (ST.hasScalarPackInsts())
992     BuildVector.legalFor({V2S16, S32});
993 
994   BuildVector
995     .minScalarSameAs(1, 0)
996     .legalIf(isRegisterType(0))
997     .minScalarOrElt(0, S32);
998 
999   if (ST.hasScalarPackInsts()) {
1000     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1001       .legalFor({V2S16, S32})
1002       .lower();
1003   } else {
1004     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1005       .lower();
1006   }
1007 
1008   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1009     .legalIf(isRegisterType(0));
1010 
1011   // TODO: Don't fully scalarize v2s16 pieces
1012   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1013 
1014   // Merge/Unmerge
1015   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1016     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1017     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1018 
1019     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1020       const LLT &Ty = Query.Types[TypeIdx];
1021       if (Ty.isVector()) {
1022         const LLT &EltTy = Ty.getElementType();
1023         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1024           return true;
1025         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1026           return true;
1027       }
1028       return false;
1029     };
1030 
1031     auto &Builder = getActionDefinitionsBuilder(Op)
1032       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1036       .clampScalar(LitTyIdx, S16, S256)
1037       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1038       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1039       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1040                            elementTypeIs(1, S16)),
1041                        changeTo(1, V2S16))
1042       // Break up vectors with weird elements into scalars
1043       .fewerElementsIf(
1044         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1045         scalarize(0))
1046       .fewerElementsIf(
1047         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1048         scalarize(1))
1049       .clampScalar(BigTyIdx, S32, S1024)
1050       .lowerFor({{S16, V2S16}});
1051 
1052     if (Op == G_MERGE_VALUES) {
1053       Builder.widenScalarIf(
1054         // TODO: Use 16-bit shifts if legal for 8-bit values?
1055         [=](const LegalityQuery &Query) {
1056           const LLT Ty = Query.Types[LitTyIdx];
1057           return Ty.getSizeInBits() < 32;
1058         },
1059         changeTo(LitTyIdx, S32));
1060     }
1061 
1062     Builder.widenScalarIf(
1063       [=](const LegalityQuery &Query) {
1064         const LLT Ty = Query.Types[BigTyIdx];
1065         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1066           Ty.getSizeInBits() % 16 != 0;
1067       },
1068       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
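        // For example, s65 is widened to s128, while s129 becomes s192 rather
        // than s256.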
1071         const LLT &Ty = Query.Types[BigTyIdx];
1072         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1073         if (NewSizeInBits >= 256) {
1074           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1075           if (RoundedTo < NewSizeInBits)
1076             NewSizeInBits = RoundedTo;
1077         }
1078         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1079       })
1080       .legalIf([=](const LegalityQuery &Query) {
1081           const LLT &BigTy = Query.Types[BigTyIdx];
1082           const LLT &LitTy = Query.Types[LitTyIdx];
1083 
1084           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1085             return false;
1086           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1087             return false;
1088 
1089           return BigTy.getSizeInBits() % 16 == 0 &&
1090                  LitTy.getSizeInBits() % 16 == 0 &&
1091                  BigTy.getSizeInBits() <= 1024;
1092         })
1093       // Any vectors left are the wrong size. Scalarize them.
1094       .scalarize(0)
1095       .scalarize(1);
1096   }
1097 
1098   // TODO: Make legal for s32, s64. s64 case needs break down in regbankselect.
1099   getActionDefinitionsBuilder(G_SEXT_INREG)
1100     .clampScalar(0, MinLegalScalarShiftTy, S64)
1101     .lower();
1102 
1103   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1104     .legalFor({S64});
1105 
1106   getActionDefinitionsBuilder({
1107       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1108       G_FCOPYSIGN,
1109 
1110       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1111       G_READ_REGISTER,
1112       G_WRITE_REGISTER,
1113 
1114       G_SADDO, G_SSUBO,
1115 
1116        // TODO: Implement
1117       G_FMINIMUM, G_FMAXIMUM
1118     }).lower();
1119 
1120   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1121         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1122         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1123     .unsupported();
1124 
1125   computeTables();
1126   verify(*ST.getInstrInfo());
1127 }
1128 
1129 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1130                                          MachineRegisterInfo &MRI,
1131                                          MachineIRBuilder &B,
1132                                          GISelChangeObserver &Observer) const {
1133   switch (MI.getOpcode()) {
1134   case TargetOpcode::G_ADDRSPACE_CAST:
1135     return legalizeAddrSpaceCast(MI, MRI, B);
1136   case TargetOpcode::G_FRINT:
1137     return legalizeFrint(MI, MRI, B);
1138   case TargetOpcode::G_FCEIL:
1139     return legalizeFceil(MI, MRI, B);
1140   case TargetOpcode::G_INTRINSIC_TRUNC:
1141     return legalizeIntrinsicTrunc(MI, MRI, B);
1142   case TargetOpcode::G_SITOFP:
1143     return legalizeITOFP(MI, MRI, B, true);
1144   case TargetOpcode::G_UITOFP:
1145     return legalizeITOFP(MI, MRI, B, false);
1146   case TargetOpcode::G_FMINNUM:
1147   case TargetOpcode::G_FMAXNUM:
1148   case TargetOpcode::G_FMINNUM_IEEE:
1149   case TargetOpcode::G_FMAXNUM_IEEE:
1150     return legalizeMinNumMaxNum(MI, MRI, B);
1151   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1152     return legalizeExtractVectorElt(MI, MRI, B);
1153   case TargetOpcode::G_INSERT_VECTOR_ELT:
1154     return legalizeInsertVectorElt(MI, MRI, B);
1155   case TargetOpcode::G_FSIN:
1156   case TargetOpcode::G_FCOS:
1157     return legalizeSinCos(MI, MRI, B);
1158   case TargetOpcode::G_GLOBAL_VALUE:
1159     return legalizeGlobalValue(MI, MRI, B);
1160   case TargetOpcode::G_LOAD:
1161     return legalizeLoad(MI, MRI, B, Observer);
1162   case TargetOpcode::G_FMAD:
1163     return legalizeFMad(MI, MRI, B);
1164   case TargetOpcode::G_FDIV:
1165     return legalizeFDIV(MI, MRI, B);
1166   case TargetOpcode::G_ATOMIC_CMPXCHG:
1167     return legalizeAtomicCmpXChg(MI, MRI, B);
1168   default:
1169     return false;
1170   }
1171 
1172   llvm_unreachable("expected switch to return");
1173 }
1174 
1175 Register AMDGPULegalizerInfo::getSegmentAperture(
1176   unsigned AS,
1177   MachineRegisterInfo &MRI,
1178   MachineIRBuilder &B) const {
1179   MachineFunction &MF = B.getMF();
1180   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1181   const LLT S32 = LLT::scalar(32);
1182 
1183   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1184 
1185   if (ST.hasApertureRegs()) {
1186     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1187     // getreg.
1188     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1189         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1190         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1191     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1192         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1193         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1194     unsigned Encoding =
1195         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1196         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1197         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1198 
1199     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1200     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1201 
1202     B.buildInstr(AMDGPU::S_GETREG_B32)
1203       .addDef(GetReg)
1204       .addImm(Encoding);
1205     MRI.setType(GetReg, S32);
1206 
1207     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1208     B.buildInstr(TargetOpcode::G_SHL)
1209       .addDef(ApertureReg)
1210       .addUse(GetReg)
1211       .addUse(ShiftAmt.getReg(0));
1212 
1213     return ApertureReg;
1214   }
1215 
1216   Register QueuePtr = MRI.createGenericVirtualRegister(
1217     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1218 
1219   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1220   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1221     return Register();
1222 
1223   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1224   // private_segment_aperture_base_hi.
1225   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1226 
1227   // TODO: can we be smarter about machine pointer info?
1228   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1229   MachineMemOperand *MMO = MF.getMachineMemOperand(
1230     PtrInfo,
1231     MachineMemOperand::MOLoad |
1232     MachineMemOperand::MODereferenceable |
1233     MachineMemOperand::MOInvariant,
1234     4,
1235     MinAlign(64, StructOffset));
1236 
1237   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1238   Register LoadAddr;
1239 
1240   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1241   B.buildLoad(LoadResult, LoadAddr, *MMO);
1242   return LoadResult;
1243 }
1244 
1245 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1246   MachineInstr &MI, MachineRegisterInfo &MRI,
1247   MachineIRBuilder &B) const {
1248   MachineFunction &MF = B.getMF();
1249 
1250   B.setInstr(MI);
1251 
1252   const LLT S32 = LLT::scalar(32);
1253   Register Dst = MI.getOperand(0).getReg();
1254   Register Src = MI.getOperand(1).getReg();
1255 
1256   LLT DstTy = MRI.getType(Dst);
1257   LLT SrcTy = MRI.getType(Src);
1258   unsigned DestAS = DstTy.getAddressSpace();
1259   unsigned SrcAS = SrcTy.getAddressSpace();
1260 
1261   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1262   // vector element.
1263   assert(!DstTy.isVector());
1264 
1265   const AMDGPUTargetMachine &TM
1266     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1267 
1268   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1269   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1270     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1271     return true;
1272   }
1273 
1274   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1275     // Truncate.
1276     B.buildExtract(Dst, Src, 0);
1277     MI.eraseFromParent();
1278     return true;
1279   }
1280 
1281   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1282     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1283     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1284 
1285     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1286     // another. Merge operands are required to be the same type, but creating an
1287     // extra ptrtoint would be kind of pointless.
1288     auto HighAddr = B.buildConstant(
1289       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1290     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1291     MI.eraseFromParent();
1292     return true;
1293   }
1294 
1295   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1296     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1297            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1298     unsigned NullVal = TM.getNullPointerValue(DestAS);
1299 
1300     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1301     auto FlatNull = B.buildConstant(SrcTy, 0);
1302 
1303     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1304 
1305     // Extract low 32-bits of the pointer.
1306     B.buildExtract(PtrLo32, Src, 0);
1307 
1308     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1309     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1310     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1311 
1312     MI.eraseFromParent();
1313     return true;
1314   }
1315 
1316   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1317     return false;
1318 
1319   if (!ST.hasFlatAddressSpace())
1320     return false;
1321 
1322   auto SegmentNull =
1323       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1324   auto FlatNull =
1325       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1326 
1327   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1328   if (!ApertureReg.isValid())
1329     return false;
1330 
1331   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1332   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1333 
1334   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1335 
1336   // Coerce the type of the low half of the result so we can use merge_values.
1337   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1338   B.buildInstr(TargetOpcode::G_PTRTOINT)
1339     .addDef(SrcAsInt)
1340     .addUse(Src);
1341 
1342   // TODO: Should we allow mismatched types but matching sizes in merges to
1343   // avoid the ptrtoint?
1344   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1345   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1346 
1347   MI.eraseFromParent();
1348   return true;
1349 }
1350 
1351 bool AMDGPULegalizerInfo::legalizeFrint(
1352   MachineInstr &MI, MachineRegisterInfo &MRI,
1353   MachineIRBuilder &B) const {
1354   B.setInstr(MI);
1355 
1356   Register Src = MI.getOperand(1).getReg();
1357   LLT Ty = MRI.getType(Src);
1358   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1359 
1360   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1361   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1362 
1363   auto C1 = B.buildFConstant(Ty, C1Val);
1364   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1365 
1366   // TODO: Should this propagate fast-math-flags?
1367   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1368   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1369 
1370   auto C2 = B.buildFConstant(Ty, C2Val);
1371   auto Fabs = B.buildFAbs(Ty, Src);
1372 
1373   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1374   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1375   return true;
1376 }
1377 
1378 bool AMDGPULegalizerInfo::legalizeFceil(
1379   MachineInstr &MI, MachineRegisterInfo &MRI,
1380   MachineIRBuilder &B) const {
1381   B.setInstr(MI);
1382 
1383   const LLT S1 = LLT::scalar(1);
1384   const LLT S64 = LLT::scalar(64);
1385 
1386   Register Src = MI.getOperand(1).getReg();
1387   assert(MRI.getType(Src) == S64);
1388 
1389   // result = trunc(src)
1390   // if (src > 0.0 && src != result)
1391   //   result += 1.0
1392 
1393   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1394 
1395   const auto Zero = B.buildFConstant(S64, 0.0);
1396   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1400   auto Add = B.buildSelect(S64, And, One, Zero);
1401 
1402   // TODO: Should this propagate fast-math-flags?
1403   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1404   return true;
1405 }
1406 
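// Extract the 11-bit biased exponent from the high 32 bits of an f64 value and
// subtract the bias (1023) to get the unbiased exponent.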
1407 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1408                                               MachineIRBuilder &B) {
1409   const unsigned FractBits = 52;
1410   const unsigned ExpBits = 11;
1411   LLT S32 = LLT::scalar(32);
1412 
1413   auto Const0 = B.buildConstant(S32, FractBits - 32);
1414   auto Const1 = B.buildConstant(S32, ExpBits);
1415 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1419 
1420   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1421 }
1422 
1423 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1424   MachineInstr &MI, MachineRegisterInfo &MRI,
1425   MachineIRBuilder &B) const {
1426   B.setInstr(MI);
1427 
1428   const LLT S1 = LLT::scalar(1);
1429   const LLT S32 = LLT::scalar(32);
1430   const LLT S64 = LLT::scalar(64);
1431 
1432   Register Src = MI.getOperand(1).getReg();
1433   assert(MRI.getType(Src) == S64);
1434 
1435   // TODO: Should this use extract since the low half is unused?
1436   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1437   Register Hi = Unmerge.getReg(1);
1438 
1439   // Extract the upper half, since this is where we will find the sign and
1440   // exponent.
1441   auto Exp = extractF64Exponent(Hi, B);
1442 
1443   const unsigned FractBits = 52;
1444 
1445   // Extract the sign bit.
1446   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1447   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1448 
1449   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1450 
1451   const auto Zero32 = B.buildConstant(S32, 0);
1452 
1453   // Extend back to 64-bits.
1454   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1455 
1456   auto Shr = B.buildAShr(S64, FractMask, Exp);
1457   auto Not = B.buildNot(S64, Shr);
1458   auto Tmp0 = B.buildAnd(S64, Src, Not);
1459   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1460 
1461   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1462   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1463 
1464   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1465   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1466   return true;
1467 }
1468 
1469 bool AMDGPULegalizerInfo::legalizeITOFP(
1470   MachineInstr &MI, MachineRegisterInfo &MRI,
1471   MachineIRBuilder &B, bool Signed) const {
1472   B.setInstr(MI);
1473 
1474   Register Dst = MI.getOperand(0).getReg();
1475   Register Src = MI.getOperand(1).getReg();
1476 
1477   const LLT S64 = LLT::scalar(64);
1478   const LLT S32 = LLT::scalar(32);
1479 
1480   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
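  // Convert the two 32-bit halves separately and recombine:
  //   x = Hi * 2^32 + Lo, so fp(x) = ldexp(fp(Hi), 32) + uitofp(Lo).
  // Each half converts exactly to double, so only the final add rounds.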
1481 
1482   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1483 
1484   auto CvtHi = Signed ?
1485     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1486     B.buildUITOFP(S64, Unmerge.getReg(1));
1487 
1488   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1489 
1490   auto ThirtyTwo = B.buildConstant(S32, 32);
1491   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1492     .addUse(CvtHi.getReg(0))
1493     .addUse(ThirtyTwo.getReg(0));
1494 
1495   // TODO: Should this propagate fast-math-flags?
1496   B.buildFAdd(Dst, LdExp, CvtLo);
1497   MI.eraseFromParent();
1498   return true;
1499 }
1500 
1501 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1502   MachineInstr &MI, MachineRegisterInfo &MRI,
1503   MachineIRBuilder &B) const {
1504   MachineFunction &MF = B.getMF();
1505   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1506 
1507   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1508                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1509 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1512   if (!MFI->getMode().IEEE)
1513     return !IsIEEEOp;
1514 
1515   if (IsIEEEOp)
1516     return true;
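  // With ieee_mode enabled the hardware min/max follow IEEE minNum/maxNum
  // semantics (quieting signaling NaNs), matching the *_IEEE opcodes; the
  // plain G_FMINNUM/G_FMAXNUM forms are lowered by the generic helper below
  // instead.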
1517 
1518   MachineIRBuilder HelperBuilder(MI);
1519   GISelObserverWrapper DummyObserver;
1520   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1521   HelperBuilder.setInstr(MI);
1522   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1523 }
1524 
1525 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1526   MachineInstr &MI, MachineRegisterInfo &MRI,
1527   MachineIRBuilder &B) const {
1528   // TODO: Should move some of this into LegalizerHelper.
1529 
1530   // TODO: Promote dynamic indexing of s16 to s32
1531   // TODO: Dynamic s64 indexing is only legal for SGPR.
1532   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1533   if (!IdxVal) // Dynamic case will be selected to register indexing.
1534     return true;
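  // A constant in-bounds index is folded to a fixed-offset G_EXTRACT (e.g.
  // element 2 of <4 x s32> becomes an extract at bit 64); an out-of-bounds
  // constant index produces G_IMPLICIT_DEF.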
1535 
1536   Register Dst = MI.getOperand(0).getReg();
1537   Register Vec = MI.getOperand(1).getReg();
1538 
1539   LLT VecTy = MRI.getType(Vec);
1540   LLT EltTy = VecTy.getElementType();
1541   assert(EltTy == MRI.getType(Dst));
1542 
1543   B.setInstr(MI);
1544 
1545   if (IdxVal.getValue() < VecTy.getNumElements())
1546     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1547   else
1548     B.buildUndef(Dst);
1549 
1550   MI.eraseFromParent();
1551   return true;
1552 }
1553 
1554 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1555   MachineInstr &MI, MachineRegisterInfo &MRI,
1556   MachineIRBuilder &B) const {
1557   // TODO: Should move some of this into LegalizerHelper.
1558 
1559   // TODO: Promote dynamic indexing of s16 to s32
1560   // TODO: Dynamic s64 indexing is only legal for SGPR.
1561   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1562   if (!IdxVal) // Dynamic case will be selected to register indexing.
1563     return true;
1564 
1565   Register Dst = MI.getOperand(0).getReg();
1566   Register Vec = MI.getOperand(1).getReg();
1567   Register Ins = MI.getOperand(2).getReg();
1568 
1569   LLT VecTy = MRI.getType(Vec);
1570   LLT EltTy = VecTy.getElementType();
1571   assert(EltTy == MRI.getType(Ins));
1572 
1573   B.setInstr(MI);
1574 
1575   if (IdxVal.getValue() < VecTy.getNumElements())
1576     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1577   else
1578     B.buildUndef(Dst);
1579 
1580   MI.eraseFromParent();
1581   return true;
1582 }
1583 
1584 bool AMDGPULegalizerInfo::legalizeSinCos(
1585   MachineInstr &MI, MachineRegisterInfo &MRI,
1586   MachineIRBuilder &B) const {
1587   B.setInstr(MI);
1588 
1589   Register DstReg = MI.getOperand(0).getReg();
1590   Register SrcReg = MI.getOperand(1).getReg();
1591   LLT Ty = MRI.getType(DstReg);
1592   unsigned Flags = MI.getFlags();
1593 
1594   Register TrigVal;
1595   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
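  // The hardware sin/cos take their input scaled by 1/(2*pi), i.e. in turns
  // rather than radians. On subtargets that only accept a reduced input
  // range, the scaled value is additionally passed through amdgcn.fract to
  // bring it into [0, 1).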
1596   if (ST.hasTrigReducedRange()) {
1597     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1598     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1599       .addUse(MulVal.getReg(0))
1600       .setMIFlags(Flags).getReg(0);
1601   } else
1602     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1603 
1604   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1605     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1606   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1607     .addUse(TrigVal)
1608     .setMIFlags(Flags);
1609   MI.eraseFromParent();
1610   return true;
1611 }
1612 
1613 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1614   Register DstReg, LLT PtrTy,
1615   MachineIRBuilder &B, const GlobalValue *GV,
1616   unsigned Offset, unsigned GAFlags) const {
1617   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1618   // to the following code sequence:
1619   //
1620   // For constant address space:
1621   //   s_getpc_b64 s[0:1]
1622   //   s_add_u32 s0, s0, $symbol
1623   //   s_addc_u32 s1, s1, 0
1624   //
1625   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1626   //   a fixup or relocation is emitted to replace $symbol with a literal
1627   //   constant, which is a pc-relative offset from the encoding of the $symbol
1628   //   operand to the global variable.
1629   //
1630   // For global address space:
1631   //   s_getpc_b64 s[0:1]
1632   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1633   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1634   //
1635   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1636   //   fixups or relocations are emitted to replace $symbol@*@lo and
1637   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1638   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1639   //   operand to the global variable.
1640   //
1641   // What we want here is an offset from the value returned by s_getpc
1642   // (which is the address of the s_add_u32 instruction) to the global
1643   // variable, but since the encoding of $symbol starts 4 bytes after the start
1644   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1645   // small. This requires us to add 4 to the global variable offset in order to
1646   // compute the correct address.
1647 
1648   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1649 
1650   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1651     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1652 
1653   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1654     .addDef(PCReg);
1655 
1656   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1657   if (GAFlags == SIInstrInfo::MO_NONE)
1658     MIB.addImm(0);
1659   else
1660     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1661 
1662   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1663 
1664   if (PtrTy.getSizeInBits() == 32)
1665     B.buildExtract(DstReg, PCReg, 0);
1666   return true;
1667  }
1668 
1669 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1670   MachineInstr &MI, MachineRegisterInfo &MRI,
1671   MachineIRBuilder &B) const {
1672   Register DstReg = MI.getOperand(0).getReg();
1673   LLT Ty = MRI.getType(DstReg);
1674   unsigned AS = Ty.getAddressSpace();
1675 
1676   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1677   MachineFunction &MF = B.getMF();
1678   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1679   B.setInstr(MI);
1680 
1681   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1682     if (!MFI->isEntryFunction()) {
1683       const Function &Fn = MF.getFunction();
1684       DiagnosticInfoUnsupported BadLDSDecl(
1685         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1686       Fn.getContext().diagnose(BadLDSDecl);
1687     }
1688 
1689     // TODO: We could emit code to handle the initialization somewhere.
1690     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1691       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1692       MI.eraseFromParent();
1693       return true;
1694     }
1695 
1696     const Function &Fn = MF.getFunction();
1697     DiagnosticInfoUnsupported BadInit(
1698       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1699     Fn.getContext().diagnose(BadInit);
1700     return true;
1701   }
1702 
1703   const SITargetLowering *TLI = ST.getTargetLowering();
1704 
1705   if (TLI->shouldEmitFixup(GV)) {
1706     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1707     MI.eraseFromParent();
1708     return true;
1709   }
1710 
1711   if (TLI->shouldEmitPCReloc(GV)) {
1712     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1713     MI.eraseFromParent();
1714     return true;
1715   }
1716 
1717   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1718   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
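  // Otherwise go through the GOT: compute the address of the global's GOT
  // entry pc-relatively, then load the 64-bit absolute address from that
  // invariant, dereferenceable slot.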
1719 
1720   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1721     MachinePointerInfo::getGOT(MF),
1722     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1723     MachineMemOperand::MOInvariant,
1724     8 /*Size*/, 8 /*Align*/);
1725 
1726   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1727 
1728   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1730     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1731     B.buildExtract(DstReg, Load, 0);
1732   } else
1733     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1734 
1735   MI.eraseFromParent();
1736   return true;
1737 }
1738 
1739 bool AMDGPULegalizerInfo::legalizeLoad(
1740   MachineInstr &MI, MachineRegisterInfo &MRI,
1741   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1742   B.setInstr(MI);
1743   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1744   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1745   Observer.changingInstr(MI);
1746   MI.getOperand(1).setReg(Cast.getReg(0));
1747   Observer.changedInstr(MI);
1748   return true;
1749 }
1750 
1751 bool AMDGPULegalizerInfo::legalizeFMad(
1752   MachineInstr &MI, MachineRegisterInfo &MRI,
1753   MachineIRBuilder &B) const {
1754   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1755   assert(Ty.isScalar());
1756 
1757   MachineFunction &MF = B.getMF();
1758   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1759 
1760   // TODO: Always legal with future ftz flag.
1761   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1762     return true;
1763   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;
1766 
1767   MachineIRBuilder HelperBuilder(MI);
1768   GISelObserverWrapper DummyObserver;
1769   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1770   HelperBuilder.setMBB(*MI.getParent());
1771   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1772 }
1773 
1774 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1775   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1776   Register DstReg = MI.getOperand(0).getReg();
1777   Register PtrReg = MI.getOperand(1).getReg();
1778   Register CmpVal = MI.getOperand(2).getReg();
1779   Register NewVal = MI.getOperand(3).getReg();
1780 
1781   assert(SITargetLowering::isFlatGlobalAddrSpace(
1782            MRI.getType(PtrReg).getAddressSpace()) &&
1783          "this should not have been custom lowered");
1784 
1785   LLT ValTy = MRI.getType(CmpVal);
1786   LLT VecTy = LLT::vector(2, ValTy);
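  // The target cmpswap instructions take the new value and the compare value
  // packed into adjacent registers, so pass them as a two-element vector
  // <new, cmp>.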
1787 
1788   B.setInstr(MI);
1789   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1790 
1791   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1792     .addDef(DstReg)
1793     .addUse(PtrReg)
1794     .addUse(PackedVal)
1795     .setMemRefs(MI.memoperands());
1796 
1797   MI.eraseFromParent();
1798   return true;
1799 }
1800 
// Return the branch instruction that uses the condition output, or null if
// the usage is invalid.
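//
// The only use of the condition must be a G_BRCOND in the same block,
// optionally followed by an unconditional G_BR to the other successor:
//   G_BRCOND %cond, %bb.then
//   G_BR %bb.else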
1802 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1803                                        MachineRegisterInfo &MRI,
1804                                        MachineInstr *&Br) {
1805   Register CondDef = MI.getOperand(0).getReg();
1806   if (!MRI.hasOneNonDBGUse(CondDef))
1807     return nullptr;
1808 
1809   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1810   if (UseMI.getParent() != MI.getParent() ||
1811       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1812     return nullptr;
1813 
1814   // Make sure the cond br is followed by a G_BR
1815   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1816   if (Next != MI.getParent()->end()) {
1817     if (Next->getOpcode() != AMDGPU::G_BR)
1818       return nullptr;
1819     Br = &*Next;
1820   }
1821 
1822   return &UseMI;
1823 }
1824 
1825 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1826                                                 Register Reg, LLT Ty) const {
1827   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1828   if (LiveIn)
1829     return LiveIn;
1830 
1831   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1832   MRI.addLiveIn(Reg, NewReg);
1833   return NewReg;
1834 }
1835 
1836 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1837                                          const ArgDescriptor *Arg) const {
1838   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1839     return false; // TODO: Handle these
1840 
1841   assert(Arg->getRegister().isPhysical());
1842 
1843   MachineRegisterInfo &MRI = *B.getMRI();
1844 
1845   LLT Ty = MRI.getType(DstReg);
1846   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1847 
1848   if (Arg->isMasked()) {
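    // A masked argument occupies a bit-field of the input register (e.g. the
    // packed workitem IDs share a single register), so shift it down and mask
    // off the other fields.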
1849     // TODO: Should we try to emit this once in the entry block?
1850     const LLT S32 = LLT::scalar(32);
1851     const unsigned Mask = Arg->getMask();
1852     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1853 
1854     Register AndMaskSrc = LiveIn;
1855 
1856     if (Shift != 0) {
1857       auto ShiftAmt = B.buildConstant(S32, Shift);
1858       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1859     }
1860 
1861     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1862   } else
1863     B.buildCopy(DstReg, LiveIn);
1864 
  // Insert the argument copy if it doesn't already exist.
1866   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1867   if (!MRI.getVRegDef(LiveIn)) {
1868     // FIXME: Should have scoped insert pt
1869     MachineBasicBlock &OrigInsBB = B.getMBB();
1870     auto OrigInsPt = B.getInsertPt();
1871 
1872     MachineBasicBlock &EntryMBB = B.getMF().front();
1873     EntryMBB.addLiveIn(Arg->getRegister());
1874     B.setInsertPt(EntryMBB, EntryMBB.begin());
1875     B.buildCopy(LiveIn, Arg->getRegister());
1876 
1877     B.setInsertPt(OrigInsBB, OrigInsPt);
1878   }
1879 
1880   return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1884   MachineInstr &MI,
1885   MachineRegisterInfo &MRI,
1886   MachineIRBuilder &B,
1887   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1888   B.setInstr(MI);
1889 
1890   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1891 
1892   const ArgDescriptor *Arg;
1893   const TargetRegisterClass *RC;
1894   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1895   if (!Arg) {
1896     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1897     return false;
1898   }
1899 
1900   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1901     MI.eraseFromParent();
1902     return true;
1903   }
1904 
1905   return false;
1906 }
1907 
1908 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1909                                        MachineRegisterInfo &MRI,
1910                                        MachineIRBuilder &B) const {
1911   B.setInstr(MI);
1912   Register Dst = MI.getOperand(0).getReg();
1913   LLT DstTy = MRI.getType(Dst);
1914   LLT S16 = LLT::scalar(16);
1915   LLT S32 = LLT::scalar(32);
1916   LLT S64 = LLT::scalar(64);
1917 
1918   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1919     return true;
1920 
1921   if (DstTy == S16)
1922     return legalizeFDIV16(MI, MRI, B);
1923   if (DstTy == S32)
1924     return legalizeFDIV32(MI, MRI, B);
1925   if (DstTy == S64)
1926     return legalizeFDIV64(MI, MRI, B);
1927 
1928   return false;
1929 }
1930 
1931 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1932                                                  MachineRegisterInfo &MRI,
1933                                                  MachineIRBuilder &B) const {
1934   Register Res = MI.getOperand(0).getReg();
1935   Register LHS = MI.getOperand(1).getReg();
1936   Register RHS = MI.getOperand(2).getReg();
1937 
1938   uint16_t Flags = MI.getFlags();
1939 
1940   LLT ResTy = MRI.getType(Res);
1941   LLT S32 = LLT::scalar(32);
1942   LLT S64 = LLT::scalar(64);
1943 
1944   const MachineFunction &MF = B.getMF();
1945   bool Unsafe =
1946     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
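  // The rcp-based expansions below are not correctly rounded (and v_rcp_f32
  // flushes denormals), so they are only used under the fast-math conditions
  // checked here; everything else falls through to the full expansions.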
1947 
1948   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1949     return false;
1950 
1951   if (!Unsafe && ResTy == S32 &&
1952       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1953     return false;
1954 
1955   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1956     // 1 / x -> RCP(x)
1957     if (CLHS->isExactlyValue(1.0)) {
1958       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1959         .addUse(RHS)
1960         .setMIFlags(Flags);
1961 
1962       MI.eraseFromParent();
1963       return true;
1964     }
1965 
1966     // -1 / x -> RCP( FNEG(x) )
1967     if (CLHS->isExactlyValue(-1.0)) {
1968       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1969       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1970         .addUse(FNeg.getReg(0))
1971         .setMIFlags(Flags);
1972 
1973       MI.eraseFromParent();
1974       return true;
1975     }
1976   }
1977 
1978   // x / y -> x * (1.0 / y)
1979   if (Unsafe) {
1980     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1981       .addUse(RHS)
1982       .setMIFlags(Flags);
1983     B.buildFMul(Res, LHS, RCP, Flags);
1984 
1985     MI.eraseFromParent();
1986     return true;
1987   }
1988 
1989   return false;
1990 }
1991 
1992 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1993                                          MachineRegisterInfo &MRI,
1994                                          MachineIRBuilder &B) const {
1995   B.setInstr(MI);
1996   Register Res = MI.getOperand(0).getReg();
1997   Register LHS = MI.getOperand(1).getReg();
1998   Register RHS = MI.getOperand(2).getReg();
1999 
2000   uint16_t Flags = MI.getFlags();
2001 
2002   LLT S16 = LLT::scalar(16);
2003   LLT S32 = LLT::scalar(32);
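  // f16 division is expanded in f32: extend both operands, form
  // lhs * rcp(rhs) in f32, truncate back to f16, and let amdgcn.div.fixup
  // handle the special cases using the original operands.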
2004 
2005   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2006   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2007 
2008   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2009     .addUse(RHSExt.getReg(0))
2010     .setMIFlags(Flags);
2011 
2012   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2013   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2014 
2015   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2016     .addUse(RDst.getReg(0))
2017     .addUse(RHS)
2018     .addUse(LHS)
2019     .setMIFlags(Flags);
2020 
2021   MI.eraseFromParent();
2022   return true;
2023 }
2024 
2025 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2026 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2027 static void toggleSPDenormMode(bool Enable,
2028                                MachineIRBuilder &B,
2029                                const GCNSubtarget &ST,
2030                                AMDGPU::SIModeRegisterDefaults Mode) {
2031   // Set SP denorm mode to this value.
2032   unsigned SPDenormMode =
2033     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2034 
2035   if (ST.hasDenormModeInst()) {
2036     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2037     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2038                                    ? FP_DENORM_FLUSH_NONE
2039                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2040 
2041     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2042     B.buildInstr(AMDGPU::S_DENORM_MODE)
2043       .addImm(NewDenormModeValue);
2044 
2045   } else {
2046     // Select FP32 bit field in mode register.
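    // This is hwreg(HW_REG_MODE, 4, 2): a 2-bit field at offset 4 of the MODE
    // register, which holds the FP32 denormal controls.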
2047     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2048                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2049                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2050 
2051     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2052       .addImm(SPDenormMode)
2053       .addImm(SPDenormModeBitField);
2054   }
2055 }
2056 
2057 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2058                                          MachineRegisterInfo &MRI,
2059                                          MachineIRBuilder &B) const {
2060   B.setInstr(MI);
2061   Register Res = MI.getOperand(0).getReg();
2062   Register LHS = MI.getOperand(1).getReg();
2063   Register RHS = MI.getOperand(2).getReg();
2064   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2065   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2066 
2067   uint16_t Flags = MI.getFlags();
2068 
2069   LLT S32 = LLT::scalar(32);
2070   LLT S1 = LLT::scalar(1);
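  // Full-precision f32 division: div_scale pre-scales the operands to avoid
  // over/underflow in the reciprocal, a chain of FMAs refines
  // rcp(denominator) and the quotient (Newton-Raphson style), div_fmas
  // applies the final correction using the scale flag, and div_fixup fixes up
  // special inputs.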
2071 
2072   auto One = B.buildFConstant(S32, 1.0f);
2073 
2074   auto DenominatorScaled =
2075     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2076       .addUse(RHS)
2077       .addUse(LHS)
2078       .addImm(1)
2079       .setMIFlags(Flags);
2080   auto NumeratorScaled =
2081     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2082       .addUse(LHS)
2083       .addUse(RHS)
2084       .addImm(0)
2085       .setMIFlags(Flags);
2086 
2087   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2088     .addUse(DenominatorScaled.getReg(0))
2089     .setMIFlags(Flags);
2090   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2091 
2092   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2093   // aren't modeled as reading it.
2094   if (!Mode.FP32Denormals)
2095     toggleSPDenormMode(true, B, ST, Mode);
2096 
2097   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2098   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2099   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2100   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2101   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2102   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2103 
2104   if (!Mode.FP32Denormals)
2105     toggleSPDenormMode(false, B, ST, Mode);
2106 
2107   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2108     .addUse(Fma4.getReg(0))
2109     .addUse(Fma1.getReg(0))
2110     .addUse(Fma3.getReg(0))
2111     .addUse(NumeratorScaled.getReg(1))
2112     .setMIFlags(Flags);
2113 
2114   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2115     .addUse(Fmas.getReg(0))
2116     .addUse(RHS)
2117     .addUse(LHS)
2118     .setMIFlags(Flags);
2119 
2120   MI.eraseFromParent();
2121   return true;
2122 }
2123 
2124 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2125                                          MachineRegisterInfo &MRI,
2126                                          MachineIRBuilder &B) const {
2127   B.setInstr(MI);
2128   Register Res = MI.getOperand(0).getReg();
2129   Register LHS = MI.getOperand(1).getReg();
2130   Register RHS = MI.getOperand(2).getReg();
2131 
2132   uint16_t Flags = MI.getFlags();
2133 
2134   LLT S64 = LLT::scalar(64);
2135   LLT S1 = LLT::scalar(1);
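  // Same structure as the f32 case: scale the operands with div_scale, refine
  // the reciprocal and the quotient with FMAs, then combine the results with
  // div_fmas and div_fixup.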
2136 
2137   auto One = B.buildFConstant(S64, 1.0);
2138 
2139   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2140     .addUse(LHS)
2141     .addUse(RHS)
2142     .addImm(1)
2143     .setMIFlags(Flags);
2144 
2145   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2146 
2147   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2148     .addUse(DivScale0.getReg(0))
2149     .setMIFlags(Flags);
2150 
2151   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2152   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2153   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2154 
2155   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2156     .addUse(LHS)
2157     .addUse(RHS)
2158     .addImm(0)
2159     .setMIFlags(Flags);
2160 
2161   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2163   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2164 
2165   Register Scale;
2166   if (!ST.hasUsableDivScaleConditionOutput()) {
2167     // Workaround a hardware bug on SI where the condition output from div_scale
2168     // is not usable.
2169 
2170     Scale = MRI.createGenericVirtualRegister(S1);
2171 
2172     LLT S32 = LLT::scalar(32);
2173 
2174     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2175     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2176     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2177     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2178 
2179     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2180                               Scale1Unmerge.getReg(1));
2181     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2182                               Scale0Unmerge.getReg(1));
2183     B.buildXor(Scale, CmpNum, CmpDen);
2184   } else {
2185     Scale = DivScale1.getReg(1);
2186   }
2187 
2188   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2189     .addUse(Fma4.getReg(0))
2190     .addUse(Fma3.getReg(0))
2191     .addUse(Mul.getReg(0))
2192     .addUse(Scale)
2193     .setMIFlags(Flags);
2194 
2195   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2196     .addUse(Fmas.getReg(0))
2197     .addUse(RHS)
2198     .addUse(LHS)
2199     .setMIFlags(Flags);
2200 
2201   MI.eraseFromParent();
2202   return true;
2203 }
2204 
2205 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2206                                                  MachineRegisterInfo &MRI,
2207                                                  MachineIRBuilder &B) const {
2208   B.setInstr(MI);
2209   Register Res = MI.getOperand(0).getReg();
2210   Register LHS = MI.getOperand(2).getReg();
2211   Register RHS = MI.getOperand(3).getReg();
2212   uint16_t Flags = MI.getFlags();
2213 
2214   LLT S32 = LLT::scalar(32);
2215   LLT S1 = LLT::scalar(1);
2216 
2217   auto Abs = B.buildFAbs(S32, RHS, Flags);
2218   const APFloat C0Val(1.0f);
2219 
2220   auto C0 = B.buildConstant(S32, 0x6f800000);
2221   auto C1 = B.buildConstant(S32, 0x2f800000);
2222   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
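  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: a huge denominator is
  // pre-scaled by 2^-32 so its reciprocal stays in the normal range, and the
  // final multiply by the same scale factor cancels the adjustment.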
2223 
2224   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2225   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2226 
2227   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2228 
2229   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2230     .addUse(Mul0.getReg(0))
2231     .setMIFlags(Flags);
2232 
2233   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2234 
2235   B.buildFMul(Res, Sel, Mul1, Flags);
2236 
2237   MI.eraseFromParent();
2238   return true;
2239 }
2240 
2241 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2242                                                  MachineRegisterInfo &MRI,
2243                                                  MachineIRBuilder &B) const {
2244   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2245   if (!MFI->isEntryFunction()) {
2246     return legalizePreloadedArgIntrin(MI, MRI, B,
2247                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2248   }
2249 
2250   B.setInstr(MI);
2251 
2252   uint64_t Offset =
2253     ST.getTargetLowering()->getImplicitParameterOffset(
2254       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
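  // For kernels the implicit arguments are laid out in the kernarg segment
  // directly after the explicit arguments, so the pointer is the preloaded
  // kernarg segment pointer plus this offset.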
2255   Register DstReg = MI.getOperand(0).getReg();
2256   LLT DstTy = MRI.getType(DstReg);
2257   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2258 
2259   const ArgDescriptor *Arg;
2260   const TargetRegisterClass *RC;
2261   std::tie(Arg, RC)
2262     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2263   if (!Arg)
2264     return false;
2265 
2266   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2267   if (!loadInputValue(KernargPtrReg, B, Arg))
2268     return false;
2269 
2270   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2271   MI.eraseFromParent();
2272   return true;
2273 }
2274 
2275 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2276                                               MachineRegisterInfo &MRI,
2277                                               MachineIRBuilder &B,
2278                                               unsigned AddrSpace) const {
2279   B.setInstr(MI);
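  // A flat pointer is in the queried segment iff the high 32 bits of the
  // address equal that segment's aperture base, so compare the upper half
  // against the aperture register.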
2280   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2281   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2282   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2283   MI.eraseFromParent();
2284   return true;
2285 }
2286 
/// Handle the register layout difference for f16 data on subtargets with
/// unpacked D16 memory instructions.
2288 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2289                                              MachineRegisterInfo &MRI,
2290                                              Register Reg) const {
2291   if (!ST.hasUnpackedD16VMem())
2292     return Reg;
2293 
2294   const LLT S16 = LLT::scalar(16);
2295   const LLT S32 = LLT::scalar(32);
2296   LLT StoreVT = MRI.getType(Reg);
2297   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
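  // With unpacked D16 memory instructions each 16-bit element lives in the
  // low half of its own 32-bit register, so split the value into s16 pieces
  // and any-extend each piece to s32 before rebuilding the vector.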
2298 
2299   auto Unmerge = B.buildUnmerge(S16, Reg);
2300 
2301   SmallVector<Register, 4> WideRegs;
2302   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2303     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2304 
2305   int NumElts = StoreVT.getNumElements();
2306 
2307   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2308 }
2309 
2310 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2311                                                  MachineRegisterInfo &MRI,
2312                                                  MachineIRBuilder &B,
2313                                                  bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
2315   Register VData = MI.getOperand(1).getReg();
2316   LLT Ty = MRI.getType(VData);
2317 
2318   B.setInstr(MI);
2319 
2320   const LLT S32 = LLT::scalar(32);
2321   const LLT S16 = LLT::scalar(16);
2322 
2323   // Fixup illegal register types for i8 stores.
2324   if (Ty == LLT::scalar(8) || Ty == S16) {
2325     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2326     MI.getOperand(1).setReg(AnyExt);
2327     return true;
2328   }
2329 
2330   if (Ty.isVector()) {
2331     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2332       if (IsFormat)
2333         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2334       return true;
2335     }
2336 
2337     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2338   }
2339 
2340   return Ty == S32;
2341 }
2342 
2343 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2344                                                MachineIRBuilder &B,
2345                                                bool IsInc) const {
2346   B.setInstr(MI);
2347   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2348                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2349   B.buildInstr(Opc)
2350     .addDef(MI.getOperand(0).getReg())
2351     .addUse(MI.getOperand(2).getReg())
2352     .addUse(MI.getOperand(3).getReg())
2353     .cloneMemRefs(MI);
2354   MI.eraseFromParent();
2355   return true;
2356 }
2357 
// FIXME: Needs an observer, like the custom legalization hooks.
2359 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2360                                             MachineRegisterInfo &MRI,
2361                                             MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
2363   auto IntrID = MI.getIntrinsicID();
2364   switch (IntrID) {
2365   case Intrinsic::amdgcn_if:
2366   case Intrinsic::amdgcn_else: {
2367     MachineInstr *Br = nullptr;
2368     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2369       const SIRegisterInfo *TRI
2370         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2371 
2372       B.setInstr(*BrCond);
2373       Register Def = MI.getOperand(1).getReg();
2374       Register Use = MI.getOperand(3).getReg();
2375 
2376       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2377       if (Br)
2378         BrTarget = Br->getOperand(0).getMBB();
2379 
2380       if (IntrID == Intrinsic::amdgcn_if) {
2381         B.buildInstr(AMDGPU::SI_IF)
2382           .addDef(Def)
2383           .addUse(Use)
2384           .addMBB(BrTarget);
2385       } else {
2386         B.buildInstr(AMDGPU::SI_ELSE)
2387           .addDef(Def)
2388           .addUse(Use)
2389           .addMBB(BrTarget)
2390           .addImm(0);
2391       }
2392 
2393       if (Br)
2394         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2395 
2396       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2397       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2398       MI.eraseFromParent();
2399       BrCond->eraseFromParent();
2400       return true;
2401     }
2402 
2403     return false;
2404   }
2405   case Intrinsic::amdgcn_loop: {
2406     MachineInstr *Br = nullptr;
2407     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2408       const SIRegisterInfo *TRI
2409         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2410 
2411       B.setInstr(*BrCond);
2412 
2413       // FIXME: Need to adjust branch targets based on unconditional branch.
2414       Register Reg = MI.getOperand(2).getReg();
2415       B.buildInstr(AMDGPU::SI_LOOP)
2416         .addUse(Reg)
2417         .addMBB(BrCond->getOperand(1).getMBB());
2418       MI.eraseFromParent();
2419       BrCond->eraseFromParent();
2420       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2421       return true;
2422     }
2423 
2424     return false;
2425   }
2426   case Intrinsic::amdgcn_kernarg_segment_ptr:
2427     return legalizePreloadedArgIntrin(
2428       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2429   case Intrinsic::amdgcn_implicitarg_ptr:
2430     return legalizeImplicitArgPtr(MI, MRI, B);
2431   case Intrinsic::amdgcn_workitem_id_x:
2432     return legalizePreloadedArgIntrin(MI, MRI, B,
2433                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2434   case Intrinsic::amdgcn_workitem_id_y:
2435     return legalizePreloadedArgIntrin(MI, MRI, B,
2436                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2437   case Intrinsic::amdgcn_workitem_id_z:
2438     return legalizePreloadedArgIntrin(MI, MRI, B,
2439                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2440   case Intrinsic::amdgcn_workgroup_id_x:
2441     return legalizePreloadedArgIntrin(MI, MRI, B,
2442                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2443   case Intrinsic::amdgcn_workgroup_id_y:
2444     return legalizePreloadedArgIntrin(MI, MRI, B,
2445                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2446   case Intrinsic::amdgcn_workgroup_id_z:
2447     return legalizePreloadedArgIntrin(MI, MRI, B,
2448                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2449   case Intrinsic::amdgcn_dispatch_ptr:
2450     return legalizePreloadedArgIntrin(MI, MRI, B,
2451                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2452   case Intrinsic::amdgcn_queue_ptr:
2453     return legalizePreloadedArgIntrin(MI, MRI, B,
2454                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2455   case Intrinsic::amdgcn_implicit_buffer_ptr:
2456     return legalizePreloadedArgIntrin(
2457       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2458   case Intrinsic::amdgcn_dispatch_id:
2459     return legalizePreloadedArgIntrin(MI, MRI, B,
2460                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2461   case Intrinsic::amdgcn_fdiv_fast:
2462     return legalizeFDIVFastIntrin(MI, MRI, B);
2463   case Intrinsic::amdgcn_is_shared:
2464     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2465   case Intrinsic::amdgcn_is_private:
2466     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2467   case Intrinsic::amdgcn_wavefrontsize: {
2468     B.setInstr(MI);
2469     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2470     MI.eraseFromParent();
2471     return true;
2472   }
2473   case Intrinsic::amdgcn_raw_buffer_store:
2474     return legalizeRawBufferStore(MI, MRI, B, false);
2475   case Intrinsic::amdgcn_raw_buffer_store_format:
2476     return legalizeRawBufferStore(MI, MRI, B, true);
2477   case Intrinsic::amdgcn_atomic_inc:
2478     return legalizeAtomicIncDec(MI, B, true);
2479   case Intrinsic::amdgcn_atomic_dec:
2480     return legalizeAtomicIncDec(MI, B, false);
2481   default:
2482     return true;
2483   }
2484 
2485   return true;
2486 }
2487