1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
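// True if the type fits in MaxSize bits and its scalar element size is a
// multiple of 32 bits.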
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
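// True for vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits (e.g. v3s16).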
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
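// True for vectors of 16-bit elements with more than two elements.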
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
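// Mutation that adds one more element to the vector type at TypeIdx.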
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
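// Mutation that reduces the number of vector elements so that each resulting
// piece is no wider than 64 bits.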
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
// Scalars up to 1024 bits that are a multiple of 32 bits, vectors of 32, 64,
// 128 or 256-bit elements, and vectors with an even number of 16-bit elements.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
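// True for scalar truncating stores wider than 32 bits, where the memory size
// is smaller than the stored value.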
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal);
248 
249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250   // elements for v3s16
251   getActionDefinitionsBuilder(G_PHI)
252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253     .legalFor(AllS32Vectors)
254     .legalFor(AllS64Vectors)
255     .legalFor(AddrSpaces64)
256     .legalFor(AddrSpaces32)
257     .clampScalar(0, S32, S256)
258     .widenScalarToNextPow2(0, 32)
259     .clampMaxNumElements(0, S32, 16)
260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261     .legalIf(isPointer(0));
262 
263   if (ST.has16BitInsts()) {
264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265       .legalFor({S32, S16})
266       .clampScalar(0, S16, S32)
267       .scalarize(0);
268   } else {
269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270       .legalFor({S32})
271       .clampScalar(0, S32, S32)
272       .scalarize(0);
273   }
274 
275   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276     .legalFor({S32})
277     .clampScalar(0, S32, S32)
278     .scalarize(0);
279 
280   // Report legal for any types we can handle anywhere. For the cases only legal
281   // on the SALU, RegBankSelect will be able to re-legalize.
282   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284     .clampScalar(0, S32, S64)
285     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287     .widenScalarToNextPow2(0)
288     .scalarize(0);
289 
290   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292     .legalFor({{S32, S1}})
293     .clampScalar(0, S32, S32)
294     .scalarize(0); // TODO: Implement.
295 
296   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297     .lower();
298 
299   getActionDefinitionsBuilder(G_BITCAST)
300     // Don't worry about the size constraint.
301     .legalIf(all(isRegisterType(0), isRegisterType(1)))
302     // FIXME: Testing hack
303     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
304 
305   getActionDefinitionsBuilder(G_FCONSTANT)
306     .legalFor({S32, S64, S16})
307     .clampScalar(0, S16, S64);
308 
309   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .clampScalarOrElt(0, S32, S1024)
314     .legalIf(isMultiple32(0))
315     .widenScalarToNextPow2(0, 32)
316     .clampMaxNumElements(0, S32, 16);
317 
318 
319   // FIXME: i1 operands to intrinsics should always be legal, but other i1
320   // values may not be legal.  We need to figure out how to distinguish
321   // between these two scenarios.
322   getActionDefinitionsBuilder(G_CONSTANT)
323     .legalFor({S1, S32, S64, S16, GlobalPtr,
324                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325     .clampScalar(0, S32, S64)
326     .widenScalarToNextPow2(0)
327     .legalIf(isPointer(0));
328 
329   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332 
333 
334   auto &FPOpActions = getActionDefinitionsBuilder(
335     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336     .legalFor({S32, S64});
337   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338     .customFor({S32, S64});
339   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
340     .customFor({S32, S64});
341 
342   if (ST.has16BitInsts()) {
343     if (ST.hasVOP3PInsts())
344       FPOpActions.legalFor({S16, V2S16});
345     else
346       FPOpActions.legalFor({S16});
347 
348     TrigActions.customFor({S16});
349     FDIVActions.customFor({S16});
350   }
351 
352   auto &MinNumMaxNum = getActionDefinitionsBuilder({
353       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
354 
355   if (ST.hasVOP3PInsts()) {
356     MinNumMaxNum.customFor(FPTypesPK16)
357       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
358       .clampMaxNumElements(0, S16, 2)
359       .clampScalar(0, S16, S64)
360       .scalarize(0);
361   } else if (ST.has16BitInsts()) {
362     MinNumMaxNum.customFor(FPTypes16)
363       .clampScalar(0, S16, S64)
364       .scalarize(0);
365   } else {
366     MinNumMaxNum.customFor(FPTypesBase)
367       .clampScalar(0, S32, S64)
368       .scalarize(0);
369   }
370 
371   if (ST.hasVOP3PInsts())
372     FPOpActions.clampMaxNumElements(0, S16, 2);
373 
374   FPOpActions
375     .scalarize(0)
376     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
377 
378   TrigActions
379     .scalarize(0)
380     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
381 
382   FDIVActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   getActionDefinitionsBuilder({G_FNEG, G_FABS})
387     .legalFor(FPTypesPK16)
388     .clampMaxNumElements(0, S16, 2)
389     .scalarize(0)
390     .clampScalar(0, S16, S64);
391 
392   // TODO: Implement
393   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
394 
395   if (ST.has16BitInsts()) {
396     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
397       .legalFor({S32, S64, S16})
398       .scalarize(0)
399       .clampScalar(0, S16, S64);
400   } else {
401     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
402       .legalFor({S32, S64})
403       .scalarize(0)
404       .clampScalar(0, S32, S64);
405   }
406 
407   getActionDefinitionsBuilder(G_FPTRUNC)
408     .legalFor({{S32, S64}, {S16, S32}})
409     .scalarize(0);
410 
411   getActionDefinitionsBuilder(G_FPEXT)
412     .legalFor({{S64, S32}, {S32, S16}})
413     .lowerFor({{S64, S16}}) // FIXME: Implement
414     .scalarize(0);
415 
416   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
417   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
418 
419   getActionDefinitionsBuilder(G_FSUB)
420       // Use actual fsub instruction
421       .legalFor({S32})
422       // Must use fadd + fneg
423       .lowerFor({S64, S16, V2S16})
424       .scalarize(0)
425       .clampScalar(0, S32, S64);
426 
427   // Whether this is legal depends on the floating point mode for the function.
428   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
429   if (ST.hasMadF16())
430     FMad.customFor({S32, S16});
431   else
432     FMad.customFor({S32});
433   FMad.scalarize(0)
434       .lower();
435 
436   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
437     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
438                {S32, S1}, {S64, S1}, {S16, S1},
439                {S96, S32},
440                // FIXME: Hack
441                {S64, LLT::scalar(33)},
442                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
443     .scalarize(0);
444 
445   // TODO: Split s1->s64 during regbankselect for VALU.
446   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
447     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
448     .lowerFor({{S32, S64}})
449     .lowerIf(typeIs(1, S1))
450     .customFor({{S64, S64}});
451   if (ST.has16BitInsts())
452     IToFP.legalFor({{S16, S16}});
453   IToFP.clampScalar(1, S32, S64)
454        .scalarize(0);
455 
456   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
457     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
458   if (ST.has16BitInsts())
459     FPToI.legalFor({{S16, S16}});
460   else
461     FPToI.minScalar(1, S32);
462 
463   FPToI.minScalar(0, S32)
464        .scalarize(0);
465 
466   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
467     .legalFor({S32, S64})
468     .scalarize(0);
469 
470   if (ST.has16BitInsts()) {
471     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
472       .legalFor({S16, S32, S64})
473       .clampScalar(0, S16, S64)
474       .scalarize(0);
475   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
476     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
477       .legalFor({S32, S64})
478       .clampScalar(0, S32, S64)
479       .scalarize(0);
480   } else {
481     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
482       .legalFor({S32})
483       .customFor({S64})
484       .clampScalar(0, S32, S64)
485       .scalarize(0);
486   }
487 
488   getActionDefinitionsBuilder(G_PTR_ADD)
489     .legalForCartesianProduct(AddrSpaces64, {S64})
490     .legalForCartesianProduct(AddrSpaces32, {S32})
491     .scalarize(0);
492 
493   getActionDefinitionsBuilder(G_PTR_MASK)
494     .scalarize(0)
495     .alwaysLegal();
496 
497   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
498 
499   auto &CmpBuilder =
500     getActionDefinitionsBuilder(G_ICMP)
501     .legalForCartesianProduct(
502       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
503     .legalFor({{S1, S32}, {S1, S64}});
504   if (ST.has16BitInsts()) {
505     CmpBuilder.legalFor({{S1, S16}});
506   }
507 
508   CmpBuilder
509     .widenScalarToNextPow2(1)
510     .clampScalar(1, S32, S64)
511     .scalarize(0)
512     .legalIf(all(typeIs(0, S1), isPointer(1)));
513 
514   getActionDefinitionsBuilder(G_FCMP)
515     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
516     .widenScalarToNextPow2(1)
517     .clampScalar(1, S32, S64)
518     .scalarize(0);
519 
520   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
521   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
522                                G_FLOG, G_FLOG2, G_FLOG10})
523     .legalFor({S32})
524     .scalarize(0);
525 
526   // The 64-bit versions produce 32-bit results, but only on the SALU.
527   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
528                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
529                                G_CTPOP})
530     .legalFor({{S32, S32}, {S32, S64}})
531     .clampScalar(0, S32, S32)
532     .clampScalar(1, S32, S64)
533     .scalarize(0)
534     .widenScalarToNextPow2(0, 32)
535     .widenScalarToNextPow2(1, 32);
536 
537   // TODO: Expand for > s32
538   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
539     .legalFor({S32})
540     .clampScalar(0, S32, S32)
541     .scalarize(0);
542 
543   if (ST.has16BitInsts()) {
544     if (ST.hasVOP3PInsts()) {
545       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
546         .legalFor({S32, S16, V2S16})
547         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
548         .clampMaxNumElements(0, S16, 2)
549         .clampScalar(0, S16, S32)
550         .widenScalarToNextPow2(0)
551         .scalarize(0);
552     } else {
553       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
554         .legalFor({S32, S16})
555         .widenScalarToNextPow2(0)
556         .clampScalar(0, S16, S32)
557         .scalarize(0);
558     }
559   } else {
560     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
561       .legalFor({S32})
562       .clampScalar(0, S32, S32)
563       .widenScalarToNextPow2(0)
564       .scalarize(0);
565   }
566 
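  // Predicates comparing the sizes of the types at two type indices.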
567   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
568     return [=](const LegalityQuery &Query) {
569       return Query.Types[TypeIdx0].getSizeInBits() <
570              Query.Types[TypeIdx1].getSizeInBits();
571     };
572   };
573 
574   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
575     return [=](const LegalityQuery &Query) {
576       return Query.Types[TypeIdx0].getSizeInBits() >
577              Query.Types[TypeIdx1].getSizeInBits();
578     };
579   };
580 
581   getActionDefinitionsBuilder(G_INTTOPTR)
582     // List the common cases
583     .legalForCartesianProduct(AddrSpaces64, {S64})
584     .legalForCartesianProduct(AddrSpaces32, {S32})
585     .scalarize(0)
586     // Accept any address space as long as the size matches
587     .legalIf(sameSize(0, 1))
588     .widenScalarIf(smallerThan(1, 0),
589       [](const LegalityQuery &Query) {
590         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
591       })
592     .narrowScalarIf(greaterThan(1, 0),
593       [](const LegalityQuery &Query) {
594         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
595       });
596 
597   getActionDefinitionsBuilder(G_PTRTOINT)
598     // List the common cases
599     .legalForCartesianProduct(AddrSpaces64, {S64})
600     .legalForCartesianProduct(AddrSpaces32, {S32})
601     .scalarize(0)
602     // Accept any address space as long as the size matches
603     .legalIf(sameSize(0, 1))
604     .widenScalarIf(smallerThan(0, 1),
605       [](const LegalityQuery &Query) {
606         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
607       })
608     .narrowScalarIf(
609       greaterThan(0, 1),
610       [](const LegalityQuery &Query) {
611         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
612       });
613 
614   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
615     .scalarize(0)
616     .custom();
617 
618   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
619   // handle some operations by just promoting the register during
620   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
621   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
622     switch (AS) {
623     // FIXME: Private element size.
624     case AMDGPUAS::PRIVATE_ADDRESS:
625       return 32;
626     // FIXME: Check subtarget
627     case AMDGPUAS::LOCAL_ADDRESS:
628       return ST.useDS128() ? 128 : 64;
629 
630     // Treat constant and global as identical. SMRD loads are sometimes usable
631     // for global loads (ideally constant address space should be eliminated)
632     // depending on the context. Legality cannot be context dependent, but
633     // RegBankSelect can split the load as necessary depending on the pointer
634     // register bank/uniformity and if the memory is invariant or not written in
635     // a kernel.
636     case AMDGPUAS::CONSTANT_ADDRESS:
637     case AMDGPUAS::GLOBAL_ADDRESS:
638       return 512;
639     default:
640       return 128;
641     }
642   };
643 
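  // Whether a load or store must be split: vector extloads, accesses wider
  // than the address space allows, 96-bit accesses without dwordx3 support,
  // and accesses too misaligned for the target to handle.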
644   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
645     const LLT DstTy = Query.Types[0];
646 
647     // Split vector extloads.
648     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
649     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
650       return true;
651 
652     const LLT PtrTy = Query.Types[1];
653     unsigned AS = PtrTy.getAddressSpace();
654     if (MemSize > maxSizeForAddrSpace(AS))
655       return true;
656 
657     // Catch weird sized loads that don't evenly divide into the access sizes
658     // TODO: May be able to widen depending on alignment etc.
659     unsigned NumRegs = MemSize / 32;
660     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
661       return true;
662 
663     unsigned Align = Query.MMODescrs[0].AlignInBits;
664     if (Align < MemSize) {
665       const SITargetLowering *TLI = ST.getTargetLowering();
666       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
667     }
668 
669     return false;
670   };
671 
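  // Required alignments (in bits) for global and flat accesses; 0 means no
  // alignment requirement when unaligned buffer access is supported.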
672   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
673   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
674   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
675 
676   // TODO: Refine based on subtargets which support unaligned access or 128-bit
677   // LDS
678   // TODO: Unsupported flat for SI.
679 
680   for (unsigned Op : {G_LOAD, G_STORE}) {
681     const bool IsStore = Op == G_STORE;
682 
683     auto &Actions = getActionDefinitionsBuilder(Op);
684     // Whitelist the common cases.
685     // TODO: Pointer loads
686     // TODO: Wide constant loads
687     // TODO: Only CI+ has 3x loads
688     // TODO: Loads to s16 on gfx9
689     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
690                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
691                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
692                                       {S96, GlobalPtr, 96, GlobalAlign32},
693                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
694                                       {S128, GlobalPtr, 128, GlobalAlign32},
695                                       {S64, GlobalPtr, 64, GlobalAlign32},
696                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
697                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
698                                       {S32, GlobalPtr, 8, GlobalAlign8},
699                                       {S32, GlobalPtr, 16, GlobalAlign16},
700 
701                                       {S32, LocalPtr, 32, 32},
702                                       {S64, LocalPtr, 64, 32},
703                                       {V2S32, LocalPtr, 64, 32},
704                                       {S32, LocalPtr, 8, 8},
705                                       {S32, LocalPtr, 16, 16},
706                                       {V2S16, LocalPtr, 32, 32},
707 
708                                       {S32, PrivatePtr, 32, 32},
709                                       {S32, PrivatePtr, 8, 8},
710                                       {S32, PrivatePtr, 16, 16},
711                                       {V2S16, PrivatePtr, 32, 32},
712 
713                                       {S32, FlatPtr, 32, GlobalAlign32},
714                                       {S32, FlatPtr, 16, GlobalAlign16},
715                                       {S32, FlatPtr, 8, GlobalAlign8},
716                                       {V2S16, FlatPtr, 32, GlobalAlign32},
717 
718                                       {S32, ConstantPtr, 32, GlobalAlign32},
719                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
720                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
721                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
722                                       {S64, ConstantPtr, 64, GlobalAlign32},
723                                       {S128, ConstantPtr, 128, GlobalAlign32},
724                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
725     Actions
726         .customIf(typeIs(1, Constant32Ptr))
727         .narrowScalarIf(
728             [=](const LegalityQuery &Query) -> bool {
729               return !Query.Types[0].isVector() && needToSplitLoad(Query);
730             },
731             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
732               const LLT DstTy = Query.Types[0];
733               const LLT PtrTy = Query.Types[1];
734 
735               const unsigned DstSize = DstTy.getSizeInBits();
736               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
737 
738               // Split extloads.
739               if (DstSize > MemSize)
740                 return std::make_pair(0, LLT::scalar(MemSize));
741 
742               if (DstSize > 32 && (DstSize % 32 != 0)) {
743                 // FIXME: Need a way to specify non-extload of larger size if
744                 // suitably aligned.
745                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
746               }
747 
748               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
749               if (MemSize > MaxSize)
750                 return std::make_pair(0, LLT::scalar(MaxSize));
751 
752               unsigned Align = Query.MMODescrs[0].AlignInBits;
753               return std::make_pair(0, LLT::scalar(Align));
754             })
755         .fewerElementsIf(
756             [=](const LegalityQuery &Query) -> bool {
757               return Query.Types[0].isVector() && needToSplitLoad(Query);
758             },
759             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
760               const LLT DstTy = Query.Types[0];
761               const LLT PtrTy = Query.Types[1];
762 
763               LLT EltTy = DstTy.getElementType();
764               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
765 
766               // Split if it's too large for the address space.
767               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
768                 unsigned NumElts = DstTy.getNumElements();
769                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
770 
771                 // FIXME: Refine when odd breakdowns handled
772                 // The scalars will need to be re-legalized.
773                 if (NumPieces == 1 || NumPieces >= NumElts ||
774                     NumElts % NumPieces != 0)
775                   return std::make_pair(0, EltTy);
776 
777                 return std::make_pair(0,
778                                       LLT::vector(NumElts / NumPieces, EltTy));
779               }
780 
781               // Need to split because of alignment.
782               unsigned Align = Query.MMODescrs[0].AlignInBits;
783               unsigned EltSize = EltTy.getSizeInBits();
784               if (EltSize > Align &&
785                   (EltSize / Align < DstTy.getNumElements())) {
786                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
787               }
788 
789               // May need relegalization for the scalars.
790               return std::make_pair(0, EltTy);
791             })
792         .minScalar(0, S32);
793 
794     if (IsStore)
795       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
796 
797     // TODO: Need a bitcast lower option?
798     Actions
799         .legalIf([=](const LegalityQuery &Query) {
800           const LLT Ty0 = Query.Types[0];
801           unsigned Size = Ty0.getSizeInBits();
802           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
803           unsigned Align = Query.MMODescrs[0].AlignInBits;
804 
805           // No extending vector loads.
806           if (Size > MemSize && Ty0.isVector())
807             return false;
808 
809           // FIXME: Widening store from alignment not valid.
810           if (MemSize < Size)
811             MemSize = std::max(MemSize, Align);
812 
813           switch (MemSize) {
814           case 8:
815           case 16:
816             return Size == 32;
817           case 32:
818           case 64:
819           case 128:
820             return true;
821           case 96:
822             return ST.hasDwordx3LoadStores();
823           case 256:
824           case 512:
825             return true;
826           default:
827             return false;
828           }
829         })
830         .widenScalarToNextPow2(0)
831         // TODO: v3s32->v4s32 with alignment
832         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
833   }
834 
835   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
836                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
837                                                   {S32, GlobalPtr, 16, 2 * 8},
838                                                   {S32, LocalPtr, 8, 8},
839                                                   {S32, LocalPtr, 16, 16},
840                                                   {S32, PrivatePtr, 8, 8},
841                                                   {S32, PrivatePtr, 16, 16},
842                                                   {S32, ConstantPtr, 8, 8},
843                                                   {S32, ConstantPtr, 16, 2 * 8}});
844   if (ST.hasFlatAddressSpace()) {
845     ExtLoads.legalForTypesWithMemDesc(
846         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
847   }
848 
849   ExtLoads.clampScalar(0, S32, S32)
850           .widenScalarToNextPow2(0)
851           .unsupportedIfMemSizeNotPow2()
852           .lower();
853 
854   auto &Atomics = getActionDefinitionsBuilder(
855     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
856      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
857      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
858      G_ATOMICRMW_UMIN})
859     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
860                {S64, GlobalPtr}, {S64, LocalPtr}});
861   if (ST.hasFlatAddressSpace()) {
862     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
863   }
864 
865   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
866     .legalFor({{S32, LocalPtr}});
867 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
870   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
871     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
872                 {S32, FlatPtr}, {S64, FlatPtr}})
873     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
874                {S32, RegionPtr}, {S64, RegionPtr}});
875 
876   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
877     .lower();
878 
879   // TODO: Pointer types, any 32-bit or 64-bit vector
880   getActionDefinitionsBuilder(G_SELECT)
881     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
882           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
883           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
884     .clampScalar(0, S16, S64)
885     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
886     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
887     .scalarize(1)
888     .clampMaxNumElements(0, S32, 2)
889     .clampMaxNumElements(0, LocalPtr, 2)
890     .clampMaxNumElements(0, PrivatePtr, 2)
891     .scalarize(0)
892     .widenScalarToNextPow2(0)
893     .legalIf(all(isPointer(0), typeIs(1, S1)));
894 
895   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
896   // be more flexible with the shift amount type.
897   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
898     .legalFor({{S32, S32}, {S64, S32}});
899   if (ST.has16BitInsts()) {
900     if (ST.hasVOP3PInsts()) {
901       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
902             .clampMaxNumElements(0, S16, 2);
903     } else
904       Shifts.legalFor({{S16, S32}, {S16, S16}});
905 
906     Shifts.clampScalar(1, S16, S32);
907     Shifts.clampScalar(0, S16, S64);
908     Shifts.widenScalarToNextPow2(0, 16);
909   } else {
910     // Make sure we legalize the shift amount type first, as the general
911     // expansion for the shifted type will produce much worse code if it hasn't
912     // been truncated already.
913     Shifts.clampScalar(1, S32, S32);
914     Shifts.clampScalar(0, S32, S64);
915     Shifts.widenScalarToNextPow2(0, 32);
916   }
917   Shifts.scalarize(0);
918 
919   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
920     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
921     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
922     unsigned IdxTypeIdx = 2;
923 
924     getActionDefinitionsBuilder(Op)
925       .customIf([=](const LegalityQuery &Query) {
926           const LLT EltTy = Query.Types[EltTypeIdx];
927           const LLT VecTy = Query.Types[VecTypeIdx];
928           const LLT IdxTy = Query.Types[IdxTypeIdx];
929           return (EltTy.getSizeInBits() == 16 ||
930                   EltTy.getSizeInBits() % 32 == 0) &&
931                  VecTy.getSizeInBits() % 32 == 0 &&
932                  VecTy.getSizeInBits() <= 1024 &&
933                  IdxTy.getSizeInBits() == 32;
934         })
935       .clampScalar(EltTypeIdx, S32, S64)
936       .clampScalar(VecTypeIdx, S32, S64)
937       .clampScalar(IdxTypeIdx, S32, S32);
938   }
939 
940   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
941     .unsupportedIf([=](const LegalityQuery &Query) {
942         const LLT &EltTy = Query.Types[1].getElementType();
943         return Query.Types[0] != EltTy;
944       });
945 
946   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
947     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
948     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
949 
950     // FIXME: Doesn't handle extract of illegal sizes.
951     getActionDefinitionsBuilder(Op)
952       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
953       // FIXME: Multiples of 16 should not be legal.
954       .legalIf([=](const LegalityQuery &Query) {
955           const LLT BigTy = Query.Types[BigTyIdx];
956           const LLT LitTy = Query.Types[LitTyIdx];
957           return (BigTy.getSizeInBits() % 32 == 0) &&
958                  (LitTy.getSizeInBits() % 16 == 0);
959         })
960       .widenScalarIf(
961         [=](const LegalityQuery &Query) {
962           const LLT BigTy = Query.Types[BigTyIdx];
963           return (BigTy.getScalarSizeInBits() < 16);
964         },
965         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
966       .widenScalarIf(
967         [=](const LegalityQuery &Query) {
968           const LLT LitTy = Query.Types[LitTyIdx];
969           return (LitTy.getScalarSizeInBits() < 16);
970         },
971         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
972       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
973       .widenScalarToNextPow2(BigTyIdx, 32);
974 
975   }
976 
977   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
978     .legalForCartesianProduct(AllS32Vectors, {S32})
979     .legalForCartesianProduct(AllS64Vectors, {S64})
980     .clampNumElements(0, V16S32, V32S32)
981     .clampNumElements(0, V2S64, V16S64)
982     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
983 
984   if (ST.hasScalarPackInsts())
985     BuildVector.legalFor({V2S16, S32});
986 
987   BuildVector
988     .minScalarSameAs(1, 0)
989     .legalIf(isRegisterType(0))
990     .minScalarOrElt(0, S32);
991 
992   if (ST.hasScalarPackInsts()) {
993     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
994       .legalFor({V2S16, S32})
995       .lower();
996   } else {
997     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
998       .lower();
999   }
1000 
1001   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1002     .legalIf(isRegisterType(0));
1003 
1004   // TODO: Don't fully scalarize v2s16 pieces
1005   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1006 
1007   // Merge/Unmerge
1008   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1009     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1010     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1011 
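    // True if the vector type at TypeIdx has an element size that cannot be
    // merged or unmerged directly: smaller than 8 bits, larger than 64 bits,
    // or not a power of 2.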
1012     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1013       const LLT &Ty = Query.Types[TypeIdx];
1014       if (Ty.isVector()) {
1015         const LLT &EltTy = Ty.getElementType();
1016         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1017           return true;
1018         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1019           return true;
1020       }
1021       return false;
1022     };
1023 
1024     auto &Builder = getActionDefinitionsBuilder(Op)
1025       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1029       .clampScalar(LitTyIdx, S16, S256)
1030       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1031       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1032       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1033                            elementTypeIs(1, S16)),
1034                        changeTo(1, V2S16))
1035       // Break up vectors with weird elements into scalars
1036       .fewerElementsIf(
1037         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1038         scalarize(0))
1039       .fewerElementsIf(
1040         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1041         scalarize(1))
1042       .clampScalar(BigTyIdx, S32, S1024)
1043       .lowerFor({{S16, V2S16}});
1044 
1045     if (Op == G_MERGE_VALUES) {
1046       Builder.widenScalarIf(
1047         // TODO: Use 16-bit shifts if legal for 8-bit values?
1048         [=](const LegalityQuery &Query) {
1049           const LLT Ty = Query.Types[LitTyIdx];
1050           return Ty.getSizeInBits() < 32;
1051         },
1052         changeTo(LitTyIdx, S32));
1053     }
1054 
1055     Builder.widenScalarIf(
1056       [=](const LegalityQuery &Query) {
1057         const LLT Ty = Query.Types[BigTyIdx];
1058         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1059           Ty.getSizeInBits() % 16 != 0;
1060       },
1061       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1064         const LLT &Ty = Query.Types[BigTyIdx];
1065         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1066         if (NewSizeInBits >= 256) {
1067           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1068           if (RoundedTo < NewSizeInBits)
1069             NewSizeInBits = RoundedTo;
1070         }
1071         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1072       })
1073       .legalIf([=](const LegalityQuery &Query) {
1074           const LLT &BigTy = Query.Types[BigTyIdx];
1075           const LLT &LitTy = Query.Types[LitTyIdx];
1076 
1077           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1078             return false;
1079           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1080             return false;
1081 
1082           return BigTy.getSizeInBits() % 16 == 0 &&
1083                  LitTy.getSizeInBits() % 16 == 0 &&
1084                  BigTy.getSizeInBits() <= 1024;
1085         })
1086       // Any vectors left are the wrong size. Scalarize them.
1087       .scalarize(0)
1088       .scalarize(1);
1089   }
1090 
1091   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1092 
1093   computeTables();
1094   verify(*ST.getInstrInfo());
1095 }
1096 
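// Dispatch operations marked Custom in the rules above to their dedicated
// legalization routines.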
1097 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1098                                          MachineRegisterInfo &MRI,
1099                                          MachineIRBuilder &B,
1100                                          GISelChangeObserver &Observer) const {
1101   switch (MI.getOpcode()) {
1102   case TargetOpcode::G_ADDRSPACE_CAST:
1103     return legalizeAddrSpaceCast(MI, MRI, B);
1104   case TargetOpcode::G_FRINT:
1105     return legalizeFrint(MI, MRI, B);
1106   case TargetOpcode::G_FCEIL:
1107     return legalizeFceil(MI, MRI, B);
1108   case TargetOpcode::G_INTRINSIC_TRUNC:
1109     return legalizeIntrinsicTrunc(MI, MRI, B);
1110   case TargetOpcode::G_SITOFP:
1111     return legalizeITOFP(MI, MRI, B, true);
1112   case TargetOpcode::G_UITOFP:
1113     return legalizeITOFP(MI, MRI, B, false);
1114   case TargetOpcode::G_FMINNUM:
1115   case TargetOpcode::G_FMAXNUM:
1116   case TargetOpcode::G_FMINNUM_IEEE:
1117   case TargetOpcode::G_FMAXNUM_IEEE:
1118     return legalizeMinNumMaxNum(MI, MRI, B);
1119   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1120     return legalizeExtractVectorElt(MI, MRI, B);
1121   case TargetOpcode::G_INSERT_VECTOR_ELT:
1122     return legalizeInsertVectorElt(MI, MRI, B);
1123   case TargetOpcode::G_FSIN:
1124   case TargetOpcode::G_FCOS:
1125     return legalizeSinCos(MI, MRI, B);
1126   case TargetOpcode::G_GLOBAL_VALUE:
1127     return legalizeGlobalValue(MI, MRI, B);
1128   case TargetOpcode::G_LOAD:
1129     return legalizeLoad(MI, MRI, B, Observer);
1130   case TargetOpcode::G_FMAD:
1131     return legalizeFMad(MI, MRI, B);
1132   case TargetOpcode::G_FDIV:
1133     return legalizeFDIV(MI, MRI, B);
1134   case TargetOpcode::G_ATOMIC_CMPXCHG:
1135     return legalizeAtomicCmpXChg(MI, MRI, B);
1136   default:
1137     return false;
1138   }
1139 
1140   llvm_unreachable("expected switch to return");
1141 }
1142 
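// Return a 32-bit value holding the aperture (the high half of a 64-bit flat
// address) for the given LDS or private address space, read either from the
// aperture registers or from the queue pointer.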
1143 Register AMDGPULegalizerInfo::getSegmentAperture(
1144   unsigned AS,
1145   MachineRegisterInfo &MRI,
1146   MachineIRBuilder &B) const {
1147   MachineFunction &MF = B.getMF();
1148   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1149   const LLT S32 = LLT::scalar(32);
1150 
1151   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1152 
1153   if (ST.hasApertureRegs()) {
1154     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1155     // getreg.
1156     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1157         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1158         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1159     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1160         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1161         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1162     unsigned Encoding =
1163         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1164         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1165         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1166 
1167     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1168     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1169 
1170     B.buildInstr(AMDGPU::S_GETREG_B32)
1171       .addDef(GetReg)
1172       .addImm(Encoding);
1173     MRI.setType(GetReg, S32);
1174 
1175     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1176     B.buildInstr(TargetOpcode::G_SHL)
1177       .addDef(ApertureReg)
1178       .addUse(GetReg)
1179       .addUse(ShiftAmt.getReg(0));
1180 
1181     return ApertureReg;
1182   }
1183 
1184   Register QueuePtr = MRI.createGenericVirtualRegister(
1185     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1186 
1187   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1188   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1189     return Register();
1190 
1191   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1192   // private_segment_aperture_base_hi.
1193   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1194 
1195   // TODO: can we be smarter about machine pointer info?
1196   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1197   MachineMemOperand *MMO = MF.getMachineMemOperand(
1198     PtrInfo,
1199     MachineMemOperand::MOLoad |
1200     MachineMemOperand::MODereferenceable |
1201     MachineMemOperand::MOInvariant,
1202     4,
1203     MinAlign(64, StructOffset));
1204 
1205   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1206   Register LoadAddr;
1207 
1208   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1209   B.buildLoad(LoadResult, LoadAddr, *MMO);
1210   return LoadResult;
1211 }
1212 
1213 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1214   MachineInstr &MI, MachineRegisterInfo &MRI,
1215   MachineIRBuilder &B) const {
1216   MachineFunction &MF = B.getMF();
1217 
1218   B.setInstr(MI);
1219 
1220   const LLT S32 = LLT::scalar(32);
1221   Register Dst = MI.getOperand(0).getReg();
1222   Register Src = MI.getOperand(1).getReg();
1223 
1224   LLT DstTy = MRI.getType(Dst);
1225   LLT SrcTy = MRI.getType(Src);
1226   unsigned DestAS = DstTy.getAddressSpace();
1227   unsigned SrcAS = SrcTy.getAddressSpace();
1228 
1229   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1230   // vector element.
1231   assert(!DstTy.isVector());
1232 
1233   const AMDGPUTargetMachine &TM
1234     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1235 
1236   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1237   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1238     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1239     return true;
1240   }
1241 
1242   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1243     // Truncate.
1244     B.buildExtract(Dst, Src, 0);
1245     MI.eraseFromParent();
1246     return true;
1247   }
1248 
1249   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1250     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1251     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1252 
1253     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1254     // another. Merge operands are required to be the same type, but creating an
1255     // extra ptrtoint would be kind of pointless.
1256     auto HighAddr = B.buildConstant(
1257       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1258     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1259     MI.eraseFromParent();
1260     return true;
1261   }
1262 
1263   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1264     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1265            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1266     unsigned NullVal = TM.getNullPointerValue(DestAS);
1267 
1268     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1269     auto FlatNull = B.buildConstant(SrcTy, 0);
1270 
1271     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1272 
1273     // Extract low 32-bits of the pointer.
1274     B.buildExtract(PtrLo32, Src, 0);
1275 
1276     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1277     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1278     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1279 
1280     MI.eraseFromParent();
1281     return true;
1282   }
1283 
1284   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1285     return false;
1286 
1287   if (!ST.hasFlatAddressSpace())
1288     return false;
1289 
1290   auto SegmentNull =
1291       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1292   auto FlatNull =
1293       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1294 
1295   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1296   if (!ApertureReg.isValid())
1297     return false;
1298 
1299   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1300   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1301 
1302   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1303 
1304   // Coerce the type of the low half of the result so we can use merge_values.
1305   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1306   B.buildInstr(TargetOpcode::G_PTRTOINT)
1307     .addDef(SrcAsInt)
1308     .addUse(Src);
1309 
1310   // TODO: Should we allow mismatched types but matching sizes in merges to
1311   // avoid the ptrtoint?
1312   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1313   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1314 
1315   MI.eraseFromParent();
1316   return true;
1317 }
1318 
1319 bool AMDGPULegalizerInfo::legalizeFrint(
1320   MachineInstr &MI, MachineRegisterInfo &MRI,
1321   MachineIRBuilder &B) const {
1322   B.setInstr(MI);
1323 
1324   Register Src = MI.getOperand(1).getReg();
1325   LLT Ty = MRI.getType(Src);
1326   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1327 
1328   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1329   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1330 
1331   auto C1 = B.buildFConstant(Ty, C1Val);
1332   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1333 
1334   // TODO: Should this propagate fast-math-flags?
1335   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1336   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1337 
1338   auto C2 = B.buildFConstant(Ty, C2Val);
1339   auto Fabs = B.buildFAbs(Ty, Src);
1340 
1341   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1342   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1344 }
1345 
1346 bool AMDGPULegalizerInfo::legalizeFceil(
1347   MachineInstr &MI, MachineRegisterInfo &MRI,
1348   MachineIRBuilder &B) const {
1349   B.setInstr(MI);
1350 
1351   const LLT S1 = LLT::scalar(1);
1352   const LLT S64 = LLT::scalar(64);
1353 
1354   Register Src = MI.getOperand(1).getReg();
1355   assert(MRI.getType(Src) == S64);
1356 
1357   // result = trunc(src)
1358   // if (src > 0.0 && src != result)
1359   //   result += 1.0
1360 
1361   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1362 
1363   const auto Zero = B.buildFConstant(S64, 0.0);
1364   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1368   auto Add = B.buildSelect(S64, And, One, Zero);
1369 
1370   // TODO: Should this propagate fast-math-flags?
1371   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1373 }
1374 
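// Extract the unbiased exponent from the high 32 bits of an f64 value.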
1375 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1376                                               MachineIRBuilder &B) {
1377   const unsigned FractBits = 52;
1378   const unsigned ExpBits = 11;
1379   LLT S32 = LLT::scalar(32);
1380 
1381   auto Const0 = B.buildConstant(S32, FractBits - 32);
1382   auto Const1 = B.buildConstant(S32, ExpBits);
1383 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1387 
1388   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1389 }
1390 
1391 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1392   MachineInstr &MI, MachineRegisterInfo &MRI,
1393   MachineIRBuilder &B) const {
1394   B.setInstr(MI);
1395 
1396   const LLT S1 = LLT::scalar(1);
1397   const LLT S32 = LLT::scalar(32);
1398   const LLT S64 = LLT::scalar(64);
1399 
1400   Register Src = MI.getOperand(1).getReg();
1401   assert(MRI.getType(Src) == S64);
1402 
1403   // TODO: Should this use extract since the low half is unused?
1404   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1405   Register Hi = Unmerge.getReg(1);
1406 
1407   // Extract the upper half, since this is where we will find the sign and
1408   // exponent.
1409   auto Exp = extractF64Exponent(Hi, B);
1410 
1411   const unsigned FractBits = 52;
1412 
1413   // Extract the sign bit.
1414   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1415   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1416 
1417   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1418 
1419   const auto Zero32 = B.buildConstant(S32, 0);
1420 
1421   // Extend back to 64-bits.
1422   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1423 
1424   auto Shr = B.buildAShr(S64, FractMask, Exp);
1425   auto Not = B.buildNot(S64, Shr);
1426   auto Tmp0 = B.buildAnd(S64, Src, Not);
1427   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1428 
1429   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1430   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1431 
1432   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1433   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1435 }
1436 
1437 bool AMDGPULegalizerInfo::legalizeITOFP(
1438   MachineInstr &MI, MachineRegisterInfo &MRI,
1439   MachineIRBuilder &B, bool Signed) const {
1440   B.setInstr(MI);
1441 
1442   Register Dst = MI.getOperand(0).getReg();
1443   Register Src = MI.getOperand(1).getReg();
1444 
1445   const LLT S64 = LLT::scalar(64);
1446   const LLT S32 = LLT::scalar(32);
1447 
1448   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
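  // Convert the two 32-bit halves separately: convert the high half (signed
  // or unsigned as requested), scale it by 2^32 with ldexp, then add the
  // unsigned conversion of the low half.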
1449 
1450   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1451 
1452   auto CvtHi = Signed ?
1453     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1454     B.buildUITOFP(S64, Unmerge.getReg(1));
1455 
1456   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1457 
1458   auto ThirtyTwo = B.buildConstant(S32, 32);
1459   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1460     .addUse(CvtHi.getReg(0))
1461     .addUse(ThirtyTwo.getReg(0));
1462 
1463   // TODO: Should this propagate fast-math-flags?
1464   B.buildFAdd(Dst, LdExp, CvtLo);
1465   MI.eraseFromParent();
1466   return true;
1467 }
1468 
1469 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1470   MachineInstr &MI, MachineRegisterInfo &MRI,
1471   MachineIRBuilder &B) const {
1472   MachineFunction &MF = B.getMF();
1473   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1474 
1475   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1476                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1477 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1480   if (!MFI->getMode().IEEE)
1481     return !IsIEEEOp;
1482 
1483   if (IsIEEEOp)
1484     return true;
1485 
1486   MachineIRBuilder HelperBuilder(MI);
1487   GISelObserverWrapper DummyObserver;
1488   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1489   HelperBuilder.setInstr(MI);
1490   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1491 }
1492 
1493 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1494   MachineInstr &MI, MachineRegisterInfo &MRI,
1495   MachineIRBuilder &B) const {
1496   // TODO: Should move some of this into LegalizerHelper.
1497 
1498   // TODO: Promote dynamic indexing of s16 to s32
1499   // TODO: Dynamic s64 indexing is only legal for SGPR.
1500   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1501   if (!IdxVal) // Dynamic case will be selected to register indexing.
1502     return true;
1503 
1504   Register Dst = MI.getOperand(0).getReg();
1505   Register Vec = MI.getOperand(1).getReg();
1506 
1507   LLT VecTy = MRI.getType(Vec);
1508   LLT EltTy = VecTy.getElementType();
1509   assert(EltTy == MRI.getType(Dst));
1510 
1511   B.setInstr(MI);
1512 
1513   if (IdxVal.getValue() < VecTy.getNumElements())
1514     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1515   else
1516     B.buildUndef(Dst);
1517 
1518   MI.eraseFromParent();
1519   return true;
1520 }
1521 
1522 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1523   MachineInstr &MI, MachineRegisterInfo &MRI,
1524   MachineIRBuilder &B) const {
1525   // TODO: Should move some of this into LegalizerHelper.
1526 
1527   // TODO: Promote dynamic indexing of s16 to s32
1528   // TODO: Dynamic s64 indexing is only legal for SGPR.
1529   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1530   if (!IdxVal) // Dynamic case will be selected to register indexing.
1531     return true;
1532 
1533   Register Dst = MI.getOperand(0).getReg();
1534   Register Vec = MI.getOperand(1).getReg();
1535   Register Ins = MI.getOperand(2).getReg();
1536 
1537   LLT VecTy = MRI.getType(Vec);
1538   LLT EltTy = VecTy.getElementType();
1539   assert(EltTy == MRI.getType(Ins));
1540 
1541   B.setInstr(MI);
1542 
1543   if (IdxVal.getValue() < VecTy.getNumElements())
1544     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1545   else
1546     B.buildUndef(Dst);
1547 
1548   MI.eraseFromParent();
1549   return true;
1550 }
1551 
1552 bool AMDGPULegalizerInfo::legalizeSinCos(
1553   MachineInstr &MI, MachineRegisterInfo &MRI,
1554   MachineIRBuilder &B) const {
1555   B.setInstr(MI);
1556 
1557   Register DstReg = MI.getOperand(0).getReg();
1558   Register SrcReg = MI.getOperand(1).getReg();
1559   LLT Ty = MRI.getType(DstReg);
1560   unsigned Flags = MI.getFlags();
1561 
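  // The hardware sin/cos take an input measured in units of 2*pi, so scale
  // the operand by 1/(2*pi) first. Subtargets with a reduced valid input
  // range additionally need the fractional part to wrap it into [0, 1).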
1562   Register TrigVal;
1563   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1564   if (ST.hasTrigReducedRange()) {
1565     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1566     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1567       .addUse(MulVal.getReg(0))
1568       .setMIFlags(Flags).getReg(0);
1569   } else
1570     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1571 
1572   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1573     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1574   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1575     .addUse(TrigVal)
1576     .setMIFlags(Flags);
1577   MI.eraseFromParent();
1578   return true;
1579 }
1580 
1581 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1582   Register DstReg, LLT PtrTy,
1583   MachineIRBuilder &B, const GlobalValue *GV,
1584   unsigned Offset, unsigned GAFlags) const {
1585   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1586   // to the following code sequence:
1587   //
1588   // For constant address space:
1589   //   s_getpc_b64 s[0:1]
1590   //   s_add_u32 s0, s0, $symbol
1591   //   s_addc_u32 s1, s1, 0
1592   //
1593   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1594   //   a fixup or relocation is emitted to replace $symbol with a literal
1595   //   constant, which is a pc-relative offset from the encoding of the $symbol
1596   //   operand to the global variable.
1597   //
1598   // For global address space:
1599   //   s_getpc_b64 s[0:1]
1600   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1601   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1602   //
1603   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1604   //   fixups or relocations are emitted to replace $symbol@*@lo and
1605   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1606   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1607   //   operand to the global variable.
1608   //
1609   // What we want here is an offset from the value returned by s_getpc
1610   // (which is the address of the s_add_u32 instruction) to the global
1611   // variable, but since the encoding of $symbol starts 4 bytes after the start
1612   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1613   // small. This requires us to add 4 to the global variable offset in order to
1614   // compute the correct address.
1615 
1616   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1617 
1618   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1619     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1620 
1621   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1622     .addDef(PCReg);
1623 
1624   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1625   if (GAFlags == SIInstrInfo::MO_NONE)
1626     MIB.addImm(0);
1627   else
1628     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1629 
1630   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1631 
1632   if (PtrTy.getSizeInBits() == 32)
1633     B.buildExtract(DstReg, PCReg, 0);
1634   return true;
1635  }
1636 
1637 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1638   MachineInstr &MI, MachineRegisterInfo &MRI,
1639   MachineIRBuilder &B) const {
1640   Register DstReg = MI.getOperand(0).getReg();
1641   LLT Ty = MRI.getType(DstReg);
1642   unsigned AS = Ty.getAddressSpace();
1643 
1644   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1645   MachineFunction &MF = B.getMF();
1646   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1647   B.setInstr(MI);
1648 
1649   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1650     if (!MFI->isEntryFunction()) {
1651       const Function &Fn = MF.getFunction();
1652       DiagnosticInfoUnsupported BadLDSDecl(
1653         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1654       Fn.getContext().diagnose(BadLDSDecl);
1655     }
1656 
1657     // TODO: We could emit code to handle the initialization somewhere.
1658     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1659       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1660       MI.eraseFromParent();
1661       return true;
1662     }
1663 
1664     const Function &Fn = MF.getFunction();
1665     DiagnosticInfoUnsupported BadInit(
1666       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1667     Fn.getContext().diagnose(BadInit);
1668     return true;
1669   }
1670 
1671   const SITargetLowering *TLI = ST.getTargetLowering();
1672 
1673   if (TLI->shouldEmitFixup(GV)) {
1674     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1675     MI.eraseFromParent();
1676     return true;
1677   }
1678 
1679   if (TLI->shouldEmitPCReloc(GV)) {
1680     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1681     MI.eraseFromParent();
1682     return true;
1683   }
1684 
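  // No fixup or pc-relative relocation applies, so load the address from the
  // GOT instead: materialize the pc-relative address of the GOT entry and
  // load the 64-bit pointer stored there.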
1685   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1686   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1687 
1688   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1689     MachinePointerInfo::getGOT(MF),
1690     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1691     MachineMemOperand::MOInvariant,
1692     8 /*Size*/, 8 /*Align*/);
1693 
1694   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1695 
1696   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1698     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1699     B.buildExtract(DstReg, Load, 0);
1700   } else
1701     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1702 
1703   MI.eraseFromParent();
1704   return true;
1705 }
1706 
1707 bool AMDGPULegalizerInfo::legalizeLoad(
1708   MachineInstr &MI, MachineRegisterInfo &MRI,
1709   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1710   B.setInstr(MI);
1711   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1712   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1713   Observer.changingInstr(MI);
1714   MI.getOperand(1).setReg(Cast.getReg(0));
1715   Observer.changedInstr(MI);
1716   return true;
1717 }
1718 
1719 bool AMDGPULegalizerInfo::legalizeFMad(
1720   MachineInstr &MI, MachineRegisterInfo &MRI,
1721   MachineIRBuilder &B) const {
1722   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1723   assert(Ty.isScalar());
1724 
1725   MachineFunction &MF = B.getMF();
1726   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1727 
1728   // TODO: Always legal with future ftz flag.
1729   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1730     return true;
1731   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1732     return true;
1733 
1735   MachineIRBuilder HelperBuilder(MI);
1736   GISelObserverWrapper DummyObserver;
1737   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1738   HelperBuilder.setMBB(*MI.getParent());
1739   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1740 }
1741 
1742 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1743   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1744   Register DstReg = MI.getOperand(0).getReg();
1745   Register PtrReg = MI.getOperand(1).getReg();
1746   Register CmpVal = MI.getOperand(2).getReg();
1747   Register NewVal = MI.getOperand(3).getReg();
1748 
1749   assert(SITargetLowering::isFlatGlobalAddrSpace(
1750            MRI.getType(PtrReg).getAddressSpace()) &&
1751          "this should not have been custom lowered");
1752 
1753   LLT ValTy = MRI.getType(CmpVal);
1754   LLT VecTy = LLT::vector(2, ValTy);
1755 
1756   B.setInstr(MI);
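  // G_AMDGPU_ATOMIC_CMPXCHG expects the new value and the compare value
  // packed together into a 2-element vector operand.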
1757   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1758 
1759   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1760     .addDef(DstReg)
1761     .addUse(PtrReg)
1762     .addUse(PackedVal)
1763     .setMemRefs(MI.memoperands());
1764 
1765   MI.eraseFromParent();
1766   return true;
1767 }
1768 
// Return the use branch instruction, or null if the usage is invalid.
1770 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1771                                        MachineRegisterInfo &MRI) {
1772   Register CondDef = MI.getOperand(0).getReg();
1773   if (!MRI.hasOneNonDBGUse(CondDef))
1774     return nullptr;
1775 
1776   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1777   return UseMI.getParent() == MI.getParent() &&
1778     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1779 }
1780 
1781 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1782                                                 Register Reg, LLT Ty) const {
1783   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1784   if (LiveIn)
1785     return LiveIn;
1786 
1787   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1788   MRI.addLiveIn(Reg, NewReg);
1789   return NewReg;
1790 }
1791 
1792 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1793                                          const ArgDescriptor *Arg) const {
1794   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1795     return false; // TODO: Handle these
1796 
1797   assert(Arg->getRegister().isPhysical());
1798 
1799   MachineRegisterInfo &MRI = *B.getMRI();
1800 
1801   LLT Ty = MRI.getType(DstReg);
1802   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1803 
1804   if (Arg->isMasked()) {
1805     // TODO: Should we try to emit this once in the entry block?
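    // The value lives in a bitfield of the physical register: shift it down
    // to bit 0 and mask away the surrounding bits.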
1806     const LLT S32 = LLT::scalar(32);
1807     const unsigned Mask = Arg->getMask();
1808     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1809 
1810     Register AndMaskSrc = LiveIn;
1811 
1812     if (Shift != 0) {
1813       auto ShiftAmt = B.buildConstant(S32, Shift);
1814       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1815     }
1816 
1817     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1818   } else
1819     B.buildCopy(DstReg, LiveIn);
1820 
  // Insert the argument copy if it doesn't already exist.
1822   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1823   if (!MRI.getVRegDef(LiveIn)) {
1824     // FIXME: Should have scoped insert pt
1825     MachineBasicBlock &OrigInsBB = B.getMBB();
1826     auto OrigInsPt = B.getInsertPt();
1827 
1828     MachineBasicBlock &EntryMBB = B.getMF().front();
1829     EntryMBB.addLiveIn(Arg->getRegister());
1830     B.setInsertPt(EntryMBB, EntryMBB.begin());
1831     B.buildCopy(LiveIn, Arg->getRegister());
1832 
1833     B.setInsertPt(OrigInsBB, OrigInsPt);
1834   }
1835 
1836   return true;
1837 }
1838 
1839 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1840   MachineInstr &MI,
1841   MachineRegisterInfo &MRI,
1842   MachineIRBuilder &B,
1843   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1844   B.setInstr(MI);
1845 
1846   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1847 
1848   const ArgDescriptor *Arg;
1849   const TargetRegisterClass *RC;
1850   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1851   if (!Arg) {
1852     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1853     return false;
1854   }
1855 
1856   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1857     MI.eraseFromParent();
1858     return true;
1859   }
1860 
1861   return false;
1862 }
1863 
1864 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1865                                        MachineRegisterInfo &MRI,
1866                                        MachineIRBuilder &B) const {
1867   B.setInstr(MI);
1868   Register Dst = MI.getOperand(0).getReg();
1869   LLT DstTy = MRI.getType(Dst);
1870   LLT S16 = LLT::scalar(16);
1871   LLT S32 = LLT::scalar(32);
1872   LLT S64 = LLT::scalar(64);
1873 
1874   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1875     return true;
1876 
1877   if (DstTy == S16)
1878     return legalizeFDIV16(MI, MRI, B);
1879   if (DstTy == S32)
1880     return legalizeFDIV32(MI, MRI, B);
1881   if (DstTy == S64)
1882     return legalizeFDIV64(MI, MRI, B);
1883 
1884   return false;
1885 }
1886 
1887 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1888                                                  MachineRegisterInfo &MRI,
1889                                                  MachineIRBuilder &B) const {
1890   Register Res = MI.getOperand(0).getReg();
1891   Register LHS = MI.getOperand(1).getReg();
1892   Register RHS = MI.getOperand(2).getReg();
1893 
1894   uint16_t Flags = MI.getFlags();
1895 
1896   LLT ResTy = MRI.getType(Res);
1897   LLT S32 = LLT::scalar(32);
1898   LLT S64 = LLT::scalar(64);
1899 
1900   const MachineFunction &MF = B.getMF();
1901   bool Unsafe =
1902     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1903 
1904   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1905     return false;
1906 
1907   if (!Unsafe && ResTy == S32 &&
1908       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1909     return false;
1910 
1911   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1912     // 1 / x -> RCP(x)
1913     if (CLHS->isExactlyValue(1.0)) {
1914       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1915         .addUse(RHS)
1916         .setMIFlags(Flags);
1917 
1918       MI.eraseFromParent();
1919       return true;
1920     }
1921 
1922     // -1 / x -> RCP( FNEG(x) )
1923     if (CLHS->isExactlyValue(-1.0)) {
1924       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1925       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1926         .addUse(FNeg.getReg(0))
1927         .setMIFlags(Flags);
1928 
1929       MI.eraseFromParent();
1930       return true;
1931     }
1932   }
1933 
1934   // x / y -> x * (1.0 / y)
1935   if (Unsafe) {
1936     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1937       .addUse(RHS)
1938       .setMIFlags(Flags);
1939     B.buildFMul(Res, LHS, RCP, Flags);
1940 
1941     MI.eraseFromParent();
1942     return true;
1943   }
1944 
1945   return false;
1946 }
1947 
1948 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1949                                          MachineRegisterInfo &MRI,
1950                                          MachineIRBuilder &B) const {
1951   B.setInstr(MI);
1952   Register Res = MI.getOperand(0).getReg();
1953   Register LHS = MI.getOperand(1).getReg();
1954   Register RHS = MI.getOperand(2).getReg();
1955 
1956   uint16_t Flags = MI.getFlags();
1957 
1958   LLT S16 = LLT::scalar(16);
1959   LLT S32 = LLT::scalar(32);
1960 
1961   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1962   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1963 
1964   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1965     .addUse(RHSExt.getReg(0))
1966     .setMIFlags(Flags);
1967 
1968   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
1969   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
1970 
1971   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
1972     .addUse(RDst.getReg(0))
1973     .addUse(RHS)
1974     .addUse(LHS)
1975     .setMIFlags(Flags);
1976 
1977   MI.eraseFromParent();
1978   return true;
1979 }
1980 
1981 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
1982 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
1983 static void toggleSPDenormMode(bool Enable,
1984                                MachineIRBuilder &B,
1985                                const GCNSubtarget &ST,
1986                                AMDGPU::SIModeRegisterDefaults Mode) {
1987   // Set SP denorm mode to this value.
1988   unsigned SPDenormMode =
1989     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1990 
1991   if (ST.hasDenormModeInst()) {
1992     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
1993     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
1994                                    ? FP_DENORM_FLUSH_NONE
1995                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
1996 
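    // The S_DENORM_MODE immediate packs the FP32 controls in bits [1:0] and
    // the FP64/FP16 controls in bits [3:2].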
1997     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
1998     B.buildInstr(AMDGPU::S_DENORM_MODE)
1999       .addImm(NewDenormModeValue);
2000 
2001   } else {
2002     // Select FP32 bit field in mode register.
2003     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2004                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2005                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2006 
2007     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2008       .addImm(SPDenormMode)
2009       .addImm(SPDenormModeBitField);
2010   }
2011 }
2012 
2013 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2014                                          MachineRegisterInfo &MRI,
2015                                          MachineIRBuilder &B) const {
2016   B.setInstr(MI);
2017   Register Res = MI.getOperand(0).getReg();
2018   Register LHS = MI.getOperand(1).getReg();
2019   Register RHS = MI.getOperand(2).getReg();
2020   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2021   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2022 
2023   uint16_t Flags = MI.getFlags();
2024 
2025   LLT S32 = LLT::scalar(32);
2026   LLT S1 = LLT::scalar(1);
2027 
2028   auto One = B.buildFConstant(S32, 1.0f);
2029 
2030   auto DenominatorScaled =
2031     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2032       .addUse(RHS)
2033       .addUse(LHS)
2034       .addImm(1)
2035       .setMIFlags(Flags);
2036   auto NumeratorScaled =
2037     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2038       .addUse(LHS)
2039       .addUse(RHS)
2040       .addImm(0)
2041       .setMIFlags(Flags);
2042 
2043   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2044     .addUse(DenominatorScaled.getReg(0))
2045     .setMIFlags(Flags);
2046   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2047 
2048   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2049   // aren't modeled as reading it.
2050   if (!Mode.FP32Denormals)
2051     toggleSPDenormMode(true, B, ST, Mode);
2052 
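  // Refine the reciprocal estimate with one Newton-Raphson step, form the
  // quotient estimate, and correct it with two more FMAs; div_fmas then
  // combines the final residual, the reciprocal, and the quotient.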
2053   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2054   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2055   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2056   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2057   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2058   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2059 
2060   if (!Mode.FP32Denormals)
2061     toggleSPDenormMode(false, B, ST, Mode);
2062 
2063   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2064     .addUse(Fma4.getReg(0))
2065     .addUse(Fma1.getReg(0))
2066     .addUse(Fma3.getReg(0))
2067     .addUse(NumeratorScaled.getReg(1))
2068     .setMIFlags(Flags);
2069 
2070   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2071     .addUse(Fmas.getReg(0))
2072     .addUse(RHS)
2073     .addUse(LHS)
2074     .setMIFlags(Flags);
2075 
2076   MI.eraseFromParent();
2077   return true;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2081                                          MachineRegisterInfo &MRI,
2082                                          MachineIRBuilder &B) const {
2083   B.setInstr(MI);
2084   Register Res = MI.getOperand(0).getReg();
2085   Register LHS = MI.getOperand(1).getReg();
2086   Register RHS = MI.getOperand(2).getReg();
2087 
2088   uint16_t Flags = MI.getFlags();
2089 
2090   LLT S64 = LLT::scalar(64);
2091   LLT S1 = LLT::scalar(1);
2092 
2093   auto One = B.buildFConstant(S64, 1.0);
2094 
2095   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2096     .addUse(LHS)
2097     .addUse(RHS)
2098     .addImm(1)
2099     .setMIFlags(Flags);
2100 
2101   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2102 
2103   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2104     .addUse(DivScale0.getReg(0))
2105     .setMIFlags(Flags);
2106 
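  // Refine the reciprocal of the scaled denominator with two Newton-Raphson
  // steps, then form the scaled quotient and a final residual for div_fmas.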
2107   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2108   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2109   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2110 
2111   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2112     .addUse(LHS)
2113     .addUse(RHS)
2114     .addImm(0)
2115     .setMIFlags(Flags);
2116 
2117   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2119   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2120 
2121   Register Scale;
2122   if (!ST.hasUsableDivScaleConditionOutput()) {
2123     // Workaround a hardware bug on SI where the condition output from div_scale
2124     // is not usable.
2125 
2126     Scale = MRI.createGenericVirtualRegister(S1);
2127 
2128     LLT S32 = LLT::scalar(32);
2129 
2130     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2131     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2132     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2133     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2134 
2135     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2136                               Scale1Unmerge.getReg(1));
2137     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2138                               Scale0Unmerge.getReg(1));
2139     B.buildXor(Scale, CmpNum, CmpDen);
2140   } else {
2141     Scale = DivScale1.getReg(1);
2142   }
2143 
2144   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2145     .addUse(Fma4.getReg(0))
2146     .addUse(Fma3.getReg(0))
2147     .addUse(Mul.getReg(0))
2148     .addUse(Scale)
2149     .setMIFlags(Flags);
2150 
2151   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2152     .addUse(Fmas.getReg(0))
2153     .addUse(RHS)
2154     .addUse(LHS)
2155     .setMIFlags(Flags);
2156 
2157   MI.eraseFromParent();
2158   return true;
2159 }
2160 
2161 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2162                                                  MachineRegisterInfo &MRI,
2163                                                  MachineIRBuilder &B) const {
2164   B.setInstr(MI);
2165   Register Res = MI.getOperand(0).getReg();
2166   Register LHS = MI.getOperand(2).getReg();
2167   Register RHS = MI.getOperand(3).getReg();
2168   uint16_t Flags = MI.getFlags();
2169 
2170   LLT S32 = LLT::scalar(32);
2171   LLT S1 = LLT::scalar(1);
2172 
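  // If |RHS| is huge (> 2^96) its reciprocal would flush to zero, so scale
  // the denominator down by 2^-32 before taking the reciprocal and apply the
  // same factor to the final product to compensate.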
2173   auto Abs = B.buildFAbs(S32, RHS, Flags);
2174   const APFloat C0Val(1.0f);
2175 
2176   auto C0 = B.buildConstant(S32, 0x6f800000);
2177   auto C1 = B.buildConstant(S32, 0x2f800000);
2178   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2179 
2180   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2181   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2182 
2183   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2184 
2185   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2186     .addUse(Mul0.getReg(0))
2187     .setMIFlags(Flags);
2188 
2189   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2190 
2191   B.buildFMul(Res, Sel, Mul1, Flags);
2192 
2193   MI.eraseFromParent();
2194   return true;
2195 }
2196 
2197 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2198                                                  MachineRegisterInfo &MRI,
2199                                                  MachineIRBuilder &B) const {
2200   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2201   if (!MFI->isEntryFunction()) {
2202     return legalizePreloadedArgIntrin(MI, MRI, B,
2203                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2204   }
2205 
2206   B.setInstr(MI);
2207 
2208   uint64_t Offset =
2209     ST.getTargetLowering()->getImplicitParameterOffset(
2210       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2211   Register DstReg = MI.getOperand(0).getReg();
2212   LLT DstTy = MRI.getType(DstReg);
2213   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2214 
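  // The implicit arguments are laid out at a fixed offset past the explicit
  // kernel arguments, so compute kernarg segment pointer + implicit offset.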
2215   const ArgDescriptor *Arg;
2216   const TargetRegisterClass *RC;
2217   std::tie(Arg, RC)
2218     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2219   if (!Arg)
2220     return false;
2221 
2222   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2223   if (!loadInputValue(KernargPtrReg, B, Arg))
2224     return false;
2225 
2226   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2227   MI.eraseFromParent();
2228   return true;
2229 }
2230 
2231 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2232                                               MachineRegisterInfo &MRI,
2233                                               MachineIRBuilder &B,
2234                                               unsigned AddrSpace) const {
2235   B.setInstr(MI);
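  // A flat pointer lies in the given segment iff the high 32 bits of the
  // address match that segment's aperture base.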
2236   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2237   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2238   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2239   MI.eraseFromParent();
2240   return true;
2241 }
2242 
2243 /// Handle register layout difference for f16 images for some subtargets.
2244 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2245                                              MachineRegisterInfo &MRI,
2246                                              Register Reg) const {
2247   if (!ST.hasUnpackedD16VMem())
2248     return Reg;
2249 
2250   const LLT S16 = LLT::scalar(16);
2251   const LLT S32 = LLT::scalar(32);
2252   LLT StoreVT = MRI.getType(Reg);
2253   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2254 
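  // Unpacked-D16 subtargets store one 16-bit element per 32-bit register, so
  // any-extend each element to 32 bits and rebuild the vector.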
2255   auto Unmerge = B.buildUnmerge(S16, Reg);
2256 
2257   SmallVector<Register, 4> WideRegs;
2258   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2259     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2260 
2261   int NumElts = StoreVT.getNumElements();
2262 
2263   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2264 }
2265 
2266 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2267                                                  MachineRegisterInfo &MRI,
2268                                                  MachineIRBuilder &B,
2269                                                  bool IsFormat) const {
2270   // TODO: Reject f16 format on targets where unsupported.
2271   Register VData = MI.getOperand(1).getReg();
2272   LLT Ty = MRI.getType(VData);
2273 
2274   B.setInstr(MI);
2275 
2276   const LLT S32 = LLT::scalar(32);
2277   const LLT S16 = LLT::scalar(16);
2278 
2279   // Fixup illegal register types for i8 stores.
2280   if (Ty == LLT::scalar(8) || Ty == S16) {
2281     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2282     MI.getOperand(1).setReg(AnyExt);
2283     return true;
2284   }
2285 
2286   if (Ty.isVector()) {
2287     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2288       if (IsFormat)
2289         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2290       return true;
2291     }
2292 
2293     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2294   }
2295 
2296   return Ty == S32;
2297 }
2298 
2299 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2300                                             MachineRegisterInfo &MRI,
2301                                             MachineIRBuilder &B) const {
2302   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2303   auto IntrID = MI.getIntrinsicID();
2304   switch (IntrID) {
2305   case Intrinsic::amdgcn_if:
2306   case Intrinsic::amdgcn_else: {
2307     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2308       const SIRegisterInfo *TRI
2309         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2310 
2311       B.setInstr(*BrCond);
2312       Register Def = MI.getOperand(1).getReg();
2313       Register Use = MI.getOperand(3).getReg();
2314 
2315       if (IntrID == Intrinsic::amdgcn_if) {
2316         B.buildInstr(AMDGPU::SI_IF)
2317           .addDef(Def)
2318           .addUse(Use)
2319           .addMBB(BrCond->getOperand(1).getMBB());
2320       } else {
2321         B.buildInstr(AMDGPU::SI_ELSE)
2322           .addDef(Def)
2323           .addUse(Use)
2324           .addMBB(BrCond->getOperand(1).getMBB())
2325           .addImm(0);
2326       }
2327 
2328       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2329       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2330       MI.eraseFromParent();
2331       BrCond->eraseFromParent();
2332       return true;
2333     }
2334 
2335     return false;
2336   }
2337   case Intrinsic::amdgcn_loop: {
2338     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2339       const SIRegisterInfo *TRI
2340         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2341 
2342       B.setInstr(*BrCond);
2343       Register Reg = MI.getOperand(2).getReg();
2344       B.buildInstr(AMDGPU::SI_LOOP)
2345         .addUse(Reg)
2346         .addMBB(BrCond->getOperand(1).getMBB());
2347       MI.eraseFromParent();
2348       BrCond->eraseFromParent();
2349       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2350       return true;
2351     }
2352 
2353     return false;
2354   }
2355   case Intrinsic::amdgcn_kernarg_segment_ptr:
2356     return legalizePreloadedArgIntrin(
2357       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2358   case Intrinsic::amdgcn_implicitarg_ptr:
2359     return legalizeImplicitArgPtr(MI, MRI, B);
2360   case Intrinsic::amdgcn_workitem_id_x:
2361     return legalizePreloadedArgIntrin(MI, MRI, B,
2362                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2363   case Intrinsic::amdgcn_workitem_id_y:
2364     return legalizePreloadedArgIntrin(MI, MRI, B,
2365                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2366   case Intrinsic::amdgcn_workitem_id_z:
2367     return legalizePreloadedArgIntrin(MI, MRI, B,
2368                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2369   case Intrinsic::amdgcn_workgroup_id_x:
2370     return legalizePreloadedArgIntrin(MI, MRI, B,
2371                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2372   case Intrinsic::amdgcn_workgroup_id_y:
2373     return legalizePreloadedArgIntrin(MI, MRI, B,
2374                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2375   case Intrinsic::amdgcn_workgroup_id_z:
2376     return legalizePreloadedArgIntrin(MI, MRI, B,
2377                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2378   case Intrinsic::amdgcn_dispatch_ptr:
2379     return legalizePreloadedArgIntrin(MI, MRI, B,
2380                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2381   case Intrinsic::amdgcn_queue_ptr:
2382     return legalizePreloadedArgIntrin(MI, MRI, B,
2383                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2384   case Intrinsic::amdgcn_implicit_buffer_ptr:
2385     return legalizePreloadedArgIntrin(
2386       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2387   case Intrinsic::amdgcn_dispatch_id:
2388     return legalizePreloadedArgIntrin(MI, MRI, B,
2389                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2390   case Intrinsic::amdgcn_fdiv_fast:
2391     return legalizeFDIVFastIntrin(MI, MRI, B);
2392   case Intrinsic::amdgcn_is_shared:
2393     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2394   case Intrinsic::amdgcn_is_private:
2395     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2396   case Intrinsic::amdgcn_wavefrontsize: {
2397     B.setInstr(MI);
2398     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2399     MI.eraseFromParent();
2400     return true;
2401   }
2402   case Intrinsic::amdgcn_raw_buffer_store:
2403     return legalizeRawBufferStore(MI, MRI, B, false);
2404   case Intrinsic::amdgcn_raw_buffer_store_format:
2405     return legalizeRawBufferStore(MI, MRI, B, true);
2406   default:
2407     return true;
2408   }
2409 
2410   return true;
2411 }
2412