1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
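// Matches types that fit in MaxSize bits and whose scalar element size is a
// multiple of 32 bits, i.e. values that divide evenly into 32-bit registers.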
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
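// Matches vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits, e.g. <3 x s16>.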
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
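// Mutation that pads the vector type with one extra element,
// e.g. <3 x s16> -> <4 x s16>.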
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
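// Mutation that reduces the element count so the resulting pieces are roughly
// 64 bits wide, e.g. <4 x s32> -> <2 x s32>.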
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
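// Matches scalar truncating stores where the stored value is wider than 32
// bits and wider than the memory size.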
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal);
248 
249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250   // elements for v3s16
251   getActionDefinitionsBuilder(G_PHI)
252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253     .legalFor(AllS32Vectors)
254     .legalFor(AllS64Vectors)
255     .legalFor(AddrSpaces64)
256     .legalFor(AddrSpaces32)
257     .clampScalar(0, S32, S256)
258     .widenScalarToNextPow2(0, 32)
259     .clampMaxNumElements(0, S32, 16)
260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261     .legalIf(isPointer(0));
262 
263   if (ST.has16BitInsts()) {
264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265       .legalFor({S32, S16})
266       .clampScalar(0, S16, S32)
267       .scalarize(0);
268   } else {
269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270       .legalFor({S32})
271       .clampScalar(0, S32, S32)
272       .scalarize(0);
273   }
274 
275   // FIXME: Not really legal. Placeholder for custom lowering.
276   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
277     .legalFor({S32, S64})
278     .clampScalar(0, S32, S64)
279     .widenScalarToNextPow2(0, 32)
280     .scalarize(0);
281 
282   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
283     .legalFor({S32})
284     .clampScalar(0, S32, S32)
285     .scalarize(0);
286 
287   // Report legal for any types we can handle anywhere. For the cases only legal
288   // on the SALU, RegBankSelect will be able to re-legalize.
289   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
290     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
291     .clampScalar(0, S32, S64)
292     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
293     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
294     .widenScalarToNextPow2(0)
295     .scalarize(0);
296 
297   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
298                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
299     .legalFor({{S32, S1}})
300     .clampScalar(0, S32, S32)
301     .scalarize(0); // TODO: Implement.
302 
303   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
304     .lower();
305 
306   getActionDefinitionsBuilder(G_BITCAST)
307     // Don't worry about the size constraint.
308     .legalIf(all(isRegisterType(0), isRegisterType(1)))
309     // FIXME: Testing hack
310     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
311 
312   getActionDefinitionsBuilder(G_FCONSTANT)
313     .legalFor({S32, S64, S16})
314     .clampScalar(0, S16, S64);
315 
316   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
317     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
318                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
319     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
320     .clampScalarOrElt(0, S32, S1024)
321     .legalIf(isMultiple32(0))
322     .widenScalarToNextPow2(0, 32)
323     .clampMaxNumElements(0, S32, 16);
324 
325 
326   // FIXME: i1 operands to intrinsics should always be legal, but other i1
327   // values may not be legal.  We need to figure out how to distinguish
328   // between these two scenarios.
329   getActionDefinitionsBuilder(G_CONSTANT)
330     .legalFor({S1, S32, S64, S16, GlobalPtr,
331                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
332     .clampScalar(0, S32, S64)
333     .widenScalarToNextPow2(0)
334     .legalIf(isPointer(0));
335 
336   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
337   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
338     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
339 
340 
341   auto &FPOpActions = getActionDefinitionsBuilder(
342     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
343     .legalFor({S32, S64});
344   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
345     .customFor({S32, S64});
346   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
347     .customFor({S32, S64});
348 
349   if (ST.has16BitInsts()) {
350     if (ST.hasVOP3PInsts())
351       FPOpActions.legalFor({S16, V2S16});
352     else
353       FPOpActions.legalFor({S16});
354 
355     TrigActions.customFor({S16});
356     FDIVActions.customFor({S16});
357   }
358 
359   auto &MinNumMaxNum = getActionDefinitionsBuilder({
360       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
361 
362   if (ST.hasVOP3PInsts()) {
363     MinNumMaxNum.customFor(FPTypesPK16)
364       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
365       .clampMaxNumElements(0, S16, 2)
366       .clampScalar(0, S16, S64)
367       .scalarize(0);
368   } else if (ST.has16BitInsts()) {
369     MinNumMaxNum.customFor(FPTypes16)
370       .clampScalar(0, S16, S64)
371       .scalarize(0);
372   } else {
373     MinNumMaxNum.customFor(FPTypesBase)
374       .clampScalar(0, S32, S64)
375       .scalarize(0);
376   }
377 
378   if (ST.hasVOP3PInsts())
379     FPOpActions.clampMaxNumElements(0, S16, 2);
380 
381   FPOpActions
382     .scalarize(0)
383     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
384 
385   TrigActions
386     .scalarize(0)
387     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
388 
389   FDIVActions
390     .scalarize(0)
391     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
392 
393   getActionDefinitionsBuilder({G_FNEG, G_FABS})
394     .legalFor(FPTypesPK16)
395     .clampMaxNumElements(0, S16, 2)
396     .scalarize(0)
397     .clampScalar(0, S16, S64);
398 
399   // TODO: Implement
400   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
401 
402   if (ST.has16BitInsts()) {
403     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
404       .legalFor({S32, S64, S16})
405       .scalarize(0)
406       .clampScalar(0, S16, S64);
407   } else {
408     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
409       .legalFor({S32, S64})
410       .scalarize(0)
411       .clampScalar(0, S32, S64);
412   }
413 
414   getActionDefinitionsBuilder(G_FPTRUNC)
415     .legalFor({{S32, S64}, {S16, S32}})
416     .scalarize(0);
417 
418   getActionDefinitionsBuilder(G_FPEXT)
419     .legalFor({{S64, S32}, {S32, S16}})
420     .lowerFor({{S64, S16}}) // FIXME: Implement
421     .scalarize(0);
422 
423   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
424   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
425 
426   getActionDefinitionsBuilder(G_FSUB)
427       // Use actual fsub instruction
428       .legalFor({S32})
429       // Must use fadd + fneg
430       .lowerFor({S64, S16, V2S16})
431       .scalarize(0)
432       .clampScalar(0, S32, S64);
433 
434   // Whether this is legal depends on the floating point mode for the function.
435   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
436   if (ST.hasMadF16())
437     FMad.customFor({S32, S16});
438   else
439     FMad.customFor({S32});
440   FMad.scalarize(0)
441       .lower();
442 
443   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
444     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
445                {S32, S1}, {S64, S1}, {S16, S1},
446                {S96, S32},
447                // FIXME: Hack
448                {S64, LLT::scalar(33)},
449                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
450     .scalarize(0);
451 
452   // TODO: Split s1->s64 during regbankselect for VALU.
453   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
454     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
455     .lowerFor({{S32, S64}})
456     .lowerIf(typeIs(1, S1))
457     .customFor({{S64, S64}});
458   if (ST.has16BitInsts())
459     IToFP.legalFor({{S16, S16}});
460   IToFP.clampScalar(1, S32, S64)
461        .scalarize(0);
462 
463   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
464     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
465   if (ST.has16BitInsts())
466     FPToI.legalFor({{S16, S16}});
467   else
468     FPToI.minScalar(1, S32);
469 
470   FPToI.minScalar(0, S32)
471        .scalarize(0);
472 
473   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
474     .legalFor({S32, S64})
475     .scalarize(0);
476 
477   if (ST.has16BitInsts()) {
478     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
479       .legalFor({S16, S32, S64})
480       .clampScalar(0, S16, S64)
481       .scalarize(0);
482   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
483     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
484       .legalFor({S32, S64})
485       .clampScalar(0, S32, S64)
486       .scalarize(0);
487   } else {
488     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
489       .legalFor({S32})
490       .customFor({S64})
491       .clampScalar(0, S32, S64)
492       .scalarize(0);
493   }
494 
495   getActionDefinitionsBuilder(G_PTR_ADD)
496     .legalForCartesianProduct(AddrSpaces64, {S64})
497     .legalForCartesianProduct(AddrSpaces32, {S32})
498     .scalarize(0);
499 
500   getActionDefinitionsBuilder(G_PTR_MASK)
501     .scalarize(0)
502     .alwaysLegal();
503 
504   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
505 
506   auto &CmpBuilder =
507     getActionDefinitionsBuilder(G_ICMP)
508     .legalForCartesianProduct(
509       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
510     .legalFor({{S1, S32}, {S1, S64}});
511   if (ST.has16BitInsts()) {
512     CmpBuilder.legalFor({{S1, S16}});
513   }
514 
515   CmpBuilder
516     .widenScalarToNextPow2(1)
517     .clampScalar(1, S32, S64)
518     .scalarize(0)
519     .legalIf(all(typeIs(0, S1), isPointer(1)));
520 
521   getActionDefinitionsBuilder(G_FCMP)
522     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
523     .widenScalarToNextPow2(1)
524     .clampScalar(1, S32, S64)
525     .scalarize(0);
526 
  // FIXME: fexp, flog2, flog10 need to be custom lowered.
528   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
529                                G_FLOG, G_FLOG2, G_FLOG10})
530     .legalFor({S32})
531     .scalarize(0);
532 
533   // The 64-bit versions produce 32-bit results, but only on the SALU.
534   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
535                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
536                                G_CTPOP})
537     .legalFor({{S32, S32}, {S32, S64}})
538     .clampScalar(0, S32, S32)
539     .clampScalar(1, S32, S64)
540     .scalarize(0)
541     .widenScalarToNextPow2(0, 32)
542     .widenScalarToNextPow2(1, 32);
543 
544   // TODO: Expand for > s32
545   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
546     .legalFor({S32})
547     .clampScalar(0, S32, S32)
548     .scalarize(0);
549 
550   if (ST.has16BitInsts()) {
551     if (ST.hasVOP3PInsts()) {
552       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
553         .legalFor({S32, S16, V2S16})
554         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
555         .clampMaxNumElements(0, S16, 2)
556         .clampScalar(0, S16, S32)
557         .widenScalarToNextPow2(0)
558         .scalarize(0);
559     } else {
560       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
561         .legalFor({S32, S16})
562         .widenScalarToNextPow2(0)
563         .clampScalar(0, S16, S32)
564         .scalarize(0);
565     }
566   } else {
567     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
568       .legalFor({S32})
569       .clampScalar(0, S32, S32)
570       .widenScalarToNextPow2(0)
571       .scalarize(0);
572   }
573 
574   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
575     return [=](const LegalityQuery &Query) {
576       return Query.Types[TypeIdx0].getSizeInBits() <
577              Query.Types[TypeIdx1].getSizeInBits();
578     };
579   };
580 
581   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
582     return [=](const LegalityQuery &Query) {
583       return Query.Types[TypeIdx0].getSizeInBits() >
584              Query.Types[TypeIdx1].getSizeInBits();
585     };
586   };
587 
588   getActionDefinitionsBuilder(G_INTTOPTR)
589     // List the common cases
590     .legalForCartesianProduct(AddrSpaces64, {S64})
591     .legalForCartesianProduct(AddrSpaces32, {S32})
592     .scalarize(0)
593     // Accept any address space as long as the size matches
594     .legalIf(sameSize(0, 1))
595     .widenScalarIf(smallerThan(1, 0),
596       [](const LegalityQuery &Query) {
597         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
598       })
599     .narrowScalarIf(greaterThan(1, 0),
600       [](const LegalityQuery &Query) {
601         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
602       });
603 
604   getActionDefinitionsBuilder(G_PTRTOINT)
605     // List the common cases
606     .legalForCartesianProduct(AddrSpaces64, {S64})
607     .legalForCartesianProduct(AddrSpaces32, {S32})
608     .scalarize(0)
609     // Accept any address space as long as the size matches
610     .legalIf(sameSize(0, 1))
611     .widenScalarIf(smallerThan(0, 1),
612       [](const LegalityQuery &Query) {
613         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
614       })
615     .narrowScalarIf(
616       greaterThan(0, 1),
617       [](const LegalityQuery &Query) {
618         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
619       });
620 
621   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
622     .scalarize(0)
623     .custom();
624 
625   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
626   // handle some operations by just promoting the register during
627   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
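
  // Maximum size in bits of a single memory access that can be selected for
  // the given address space.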
628   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
629     switch (AS) {
630     // FIXME: Private element size.
631     case AMDGPUAS::PRIVATE_ADDRESS:
632       return 32;
633     // FIXME: Check subtarget
634     case AMDGPUAS::LOCAL_ADDRESS:
635       return ST.useDS128() ? 128 : 64;
636 
637     // Treat constant and global as identical. SMRD loads are sometimes usable
638     // for global loads (ideally constant address space should be eliminated)
639     // depending on the context. Legality cannot be context dependent, but
640     // RegBankSelect can split the load as necessary depending on the pointer
641     // register bank/uniformity and if the memory is invariant or not written in
642     // a kernel.
643     case AMDGPUAS::CONSTANT_ADDRESS:
644     case AMDGPUAS::GLOBAL_ADDRESS:
645       return 512;
646     default:
647       return 128;
648     }
649   };
650 
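  // Return true if this memory access must be split into multiple pieces:
  // vector extloads, accesses wider than the address space allows, dwordx3
  // accesses on subtargets without them, or insufficiently aligned accesses.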
651   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
652     const LLT DstTy = Query.Types[0];
653 
654     // Split vector extloads.
655     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
656     unsigned Align = Query.MMODescrs[0].AlignInBits;
657 
658     if (MemSize < DstTy.getSizeInBits())
659       MemSize = std::max(MemSize, Align);
660 
661     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
662       return true;
663 
664     const LLT PtrTy = Query.Types[1];
665     unsigned AS = PtrTy.getAddressSpace();
666     if (MemSize > maxSizeForAddrSpace(AS))
667       return true;
668 
    // Catch weirdly-sized loads that don't divide evenly into the access sizes.
670     // TODO: May be able to widen depending on alignment etc.
671     unsigned NumRegs = MemSize / 32;
672     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
673       return true;
674 
675     if (Align < MemSize) {
676       const SITargetLowering *TLI = ST.getTargetLowering();
677       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
678     }
679 
680     return false;
681   };
682 
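  // When unaligned buffer access is unsupported, these are the minimum
  // alignments (in bits) required by the global/flat access rules below;
  // 0 means no alignment constraint.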
683   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
684   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
685   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
686 
687   // TODO: Refine based on subtargets which support unaligned access or 128-bit
688   // LDS
689   // TODO: Unsupported flat for SI.
690 
691   for (unsigned Op : {G_LOAD, G_STORE}) {
692     const bool IsStore = Op == G_STORE;
693 
694     auto &Actions = getActionDefinitionsBuilder(Op);
695     // Whitelist the common cases.
696     // TODO: Pointer loads
697     // TODO: Wide constant loads
698     // TODO: Only CI+ has 3x loads
699     // TODO: Loads to s16 on gfx9
700     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
701                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
702                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
703                                       {S96, GlobalPtr, 96, GlobalAlign32},
704                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
705                                       {S128, GlobalPtr, 128, GlobalAlign32},
706                                       {S64, GlobalPtr, 64, GlobalAlign32},
707                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
708                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
709                                       {S32, GlobalPtr, 8, GlobalAlign8},
710                                       {S32, GlobalPtr, 16, GlobalAlign16},
711 
712                                       {S32, LocalPtr, 32, 32},
713                                       {S64, LocalPtr, 64, 32},
714                                       {V2S32, LocalPtr, 64, 32},
715                                       {S32, LocalPtr, 8, 8},
716                                       {S32, LocalPtr, 16, 16},
717                                       {V2S16, LocalPtr, 32, 32},
718 
719                                       {S32, PrivatePtr, 32, 32},
720                                       {S32, PrivatePtr, 8, 8},
721                                       {S32, PrivatePtr, 16, 16},
722                                       {V2S16, PrivatePtr, 32, 32},
723 
724                                       {S32, FlatPtr, 32, GlobalAlign32},
725                                       {S32, FlatPtr, 16, GlobalAlign16},
726                                       {S32, FlatPtr, 8, GlobalAlign8},
727                                       {V2S16, FlatPtr, 32, GlobalAlign32},
728 
729                                       {S32, ConstantPtr, 32, GlobalAlign32},
730                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
731                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
732                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
733                                       {S64, ConstantPtr, 64, GlobalAlign32},
734                                       {S128, ConstantPtr, 128, GlobalAlign32},
735                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
736     Actions
737         .customIf(typeIs(1, Constant32Ptr))
738         .narrowScalarIf(
739             [=](const LegalityQuery &Query) -> bool {
740               return !Query.Types[0].isVector() && needToSplitLoad(Query);
741             },
742             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
743               const LLT DstTy = Query.Types[0];
744               const LLT PtrTy = Query.Types[1];
745 
746               const unsigned DstSize = DstTy.getSizeInBits();
747               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
748 
749               // Split extloads.
750               if (DstSize > MemSize)
751                 return std::make_pair(0, LLT::scalar(MemSize));
752 
753               if (DstSize > 32 && (DstSize % 32 != 0)) {
754                 // FIXME: Need a way to specify non-extload of larger size if
755                 // suitably aligned.
756                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
757               }
758 
759               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
760               if (MemSize > MaxSize)
761                 return std::make_pair(0, LLT::scalar(MaxSize));
762 
763               unsigned Align = Query.MMODescrs[0].AlignInBits;
764               return std::make_pair(0, LLT::scalar(Align));
765             })
766         .fewerElementsIf(
767             [=](const LegalityQuery &Query) -> bool {
768               return Query.Types[0].isVector() && needToSplitLoad(Query);
769             },
770             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
771               const LLT DstTy = Query.Types[0];
772               const LLT PtrTy = Query.Types[1];
773 
774               LLT EltTy = DstTy.getElementType();
775               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
776 
777               // Split if it's too large for the address space.
778               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
779                 unsigned NumElts = DstTy.getNumElements();
780                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
781 
782                 // FIXME: Refine when odd breakdowns handled
783                 // The scalars will need to be re-legalized.
784                 if (NumPieces == 1 || NumPieces >= NumElts ||
785                     NumElts % NumPieces != 0)
786                   return std::make_pair(0, EltTy);
787 
788                 return std::make_pair(0,
789                                       LLT::vector(NumElts / NumPieces, EltTy));
790               }
791 
792               // Need to split because of alignment.
793               unsigned Align = Query.MMODescrs[0].AlignInBits;
794               unsigned EltSize = EltTy.getSizeInBits();
795               if (EltSize > Align &&
796                   (EltSize / Align < DstTy.getNumElements())) {
797                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
798               }
799 
800               // May need relegalization for the scalars.
801               return std::make_pair(0, EltTy);
802             })
803         .minScalar(0, S32);
804 
805     if (IsStore)
806       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
807 
808     // TODO: Need a bitcast lower option?
809     Actions
810         .legalIf([=](const LegalityQuery &Query) {
811           const LLT Ty0 = Query.Types[0];
812           unsigned Size = Ty0.getSizeInBits();
813           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
814           unsigned Align = Query.MMODescrs[0].AlignInBits;
815 
816           // FIXME: Widening store from alignment not valid.
817           if (MemSize < Size)
818             MemSize = std::max(MemSize, Align);
819 
820           // No extending vector loads.
821           if (Size > MemSize && Ty0.isVector())
822             return false;
823 
824           switch (MemSize) {
825           case 8:
826           case 16:
827             return Size == 32;
828           case 32:
829           case 64:
830           case 128:
831             return true;
832           case 96:
833             return ST.hasDwordx3LoadStores();
834           case 256:
835           case 512:
836             return true;
837           default:
838             return false;
839           }
840         })
841         .widenScalarToNextPow2(0)
842         // TODO: v3s32->v4s32 with alignment
843         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
844   }
845 
846   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
847                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
848                                                   {S32, GlobalPtr, 16, 2 * 8},
849                                                   {S32, LocalPtr, 8, 8},
850                                                   {S32, LocalPtr, 16, 16},
851                                                   {S32, PrivatePtr, 8, 8},
852                                                   {S32, PrivatePtr, 16, 16},
853                                                   {S32, ConstantPtr, 8, 8},
854                                                   {S32, ConstantPtr, 16, 2 * 8}});
855   if (ST.hasFlatAddressSpace()) {
856     ExtLoads.legalForTypesWithMemDesc(
857         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
858   }
859 
860   ExtLoads.clampScalar(0, S32, S32)
861           .widenScalarToNextPow2(0)
862           .unsupportedIfMemSizeNotPow2()
863           .lower();
864 
865   auto &Atomics = getActionDefinitionsBuilder(
866     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
867      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
868      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
869      G_ATOMICRMW_UMIN})
870     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
871                {S64, GlobalPtr}, {S64, LocalPtr}});
872   if (ST.hasFlatAddressSpace()) {
873     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
874   }
875 
876   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
877     .legalFor({{S32, LocalPtr}});
878 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
881   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
882     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
883                 {S32, FlatPtr}, {S64, FlatPtr}})
884     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
885                {S32, RegionPtr}, {S64, RegionPtr}});
886 
887   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
888     .lower();
889 
890   // TODO: Pointer types, any 32-bit or 64-bit vector
891   getActionDefinitionsBuilder(G_SELECT)
892     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
893           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
894           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
895     .clampScalar(0, S16, S64)
896     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
897     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
898     .scalarize(1)
899     .clampMaxNumElements(0, S32, 2)
900     .clampMaxNumElements(0, LocalPtr, 2)
901     .clampMaxNumElements(0, PrivatePtr, 2)
902     .scalarize(0)
903     .widenScalarToNextPow2(0)
904     .legalIf(all(isPointer(0), typeIs(1, S1)));
905 
906   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
907   // be more flexible with the shift amount type.
908   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
909     .legalFor({{S32, S32}, {S64, S32}});
910   if (ST.has16BitInsts()) {
911     if (ST.hasVOP3PInsts()) {
912       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
913             .clampMaxNumElements(0, S16, 2);
914     } else
915       Shifts.legalFor({{S16, S32}, {S16, S16}});
916 
917     Shifts.clampScalar(1, S16, S32);
918     Shifts.clampScalar(0, S16, S64);
919     Shifts.widenScalarToNextPow2(0, 16);
920   } else {
921     // Make sure we legalize the shift amount type first, as the general
922     // expansion for the shifted type will produce much worse code if it hasn't
923     // been truncated already.
924     Shifts.clampScalar(1, S32, S32);
925     Shifts.clampScalar(0, S32, S64);
926     Shifts.widenScalarToNextPow2(0, 32);
927   }
928   Shifts.scalarize(0);
929 
930   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
931     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
932     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
933     unsigned IdxTypeIdx = 2;
934 
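    // Use custom lowering when the element is 16-bit or a multiple of 32 bits
    // wide, the vector size is a multiple of 32 bits and at most 1024 bits,
    // and the index is 32-bit.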
935     getActionDefinitionsBuilder(Op)
936       .customIf([=](const LegalityQuery &Query) {
937           const LLT EltTy = Query.Types[EltTypeIdx];
938           const LLT VecTy = Query.Types[VecTypeIdx];
939           const LLT IdxTy = Query.Types[IdxTypeIdx];
940           return (EltTy.getSizeInBits() == 16 ||
941                   EltTy.getSizeInBits() % 32 == 0) &&
942                  VecTy.getSizeInBits() % 32 == 0 &&
943                  VecTy.getSizeInBits() <= 1024 &&
944                  IdxTy.getSizeInBits() == 32;
945         })
946       .clampScalar(EltTypeIdx, S32, S64)
947       .clampScalar(VecTypeIdx, S32, S64)
948       .clampScalar(IdxTypeIdx, S32, S32);
949   }
950 
951   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
952     .unsupportedIf([=](const LegalityQuery &Query) {
953         const LLT &EltTy = Query.Types[1].getElementType();
954         return Query.Types[0] != EltTy;
955       });
956 
957   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
958     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
959     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
960 
961     // FIXME: Doesn't handle extract of illegal sizes.
962     getActionDefinitionsBuilder(Op)
963       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
964       // FIXME: Multiples of 16 should not be legal.
965       .legalIf([=](const LegalityQuery &Query) {
966           const LLT BigTy = Query.Types[BigTyIdx];
967           const LLT LitTy = Query.Types[LitTyIdx];
968           return (BigTy.getSizeInBits() % 32 == 0) &&
969                  (LitTy.getSizeInBits() % 16 == 0);
970         })
971       .widenScalarIf(
972         [=](const LegalityQuery &Query) {
973           const LLT BigTy = Query.Types[BigTyIdx];
974           return (BigTy.getScalarSizeInBits() < 16);
975         },
976         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
977       .widenScalarIf(
978         [=](const LegalityQuery &Query) {
979           const LLT LitTy = Query.Types[LitTyIdx];
980           return (LitTy.getScalarSizeInBits() < 16);
981         },
982         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
983       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
984       .widenScalarToNextPow2(BigTyIdx, 32);
985 
986   }
987 
988   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
989     .legalForCartesianProduct(AllS32Vectors, {S32})
990     .legalForCartesianProduct(AllS64Vectors, {S64})
991     .clampNumElements(0, V16S32, V32S32)
992     .clampNumElements(0, V2S64, V16S64)
993     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
994 
995   if (ST.hasScalarPackInsts())
996     BuildVector.legalFor({V2S16, S32});
997 
998   BuildVector
999     .minScalarSameAs(1, 0)
1000     .legalIf(isRegisterType(0))
1001     .minScalarOrElt(0, S32);
1002 
1003   if (ST.hasScalarPackInsts()) {
1004     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1005       .legalFor({V2S16, S32})
1006       .lower();
1007   } else {
1008     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1009       .lower();
1010   }
1011 
1012   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1013     .legalIf(isRegisterType(0));
1014 
1015   // TODO: Don't fully scalarize v2s16 pieces
1016   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1017 
1018   // Merge/Unmerge
1019   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1020     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1021     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1022 
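    // Element types narrower than 8 bits, wider than 64 bits, or with a
    // non-power-of-2 size cannot be merged/unmerged directly; scalarize them.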
1023     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1024       const LLT &Ty = Query.Types[TypeIdx];
1025       if (Ty.isVector()) {
1026         const LLT &EltTy = Ty.getElementType();
1027         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1028           return true;
1029         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1030           return true;
1031       }
1032       return false;
1033     };
1034 
1035     auto &Builder = getActionDefinitionsBuilder(Op)
1036       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1037       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1038       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1039       // valid.
1040       .clampScalar(LitTyIdx, S16, S256)
1041       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1042       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1043       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1044                            elementTypeIs(1, S16)),
1045                        changeTo(1, V2S16))
1046       // Break up vectors with weird elements into scalars
1047       .fewerElementsIf(
1048         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1049         scalarize(0))
1050       .fewerElementsIf(
1051         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1052         scalarize(1))
1053       .clampScalar(BigTyIdx, S32, S1024)
1054       .lowerFor({{S16, V2S16}});
1055 
1056     if (Op == G_MERGE_VALUES) {
1057       Builder.widenScalarIf(
1058         // TODO: Use 16-bit shifts if legal for 8-bit values?
1059         [=](const LegalityQuery &Query) {
1060           const LLT Ty = Query.Types[LitTyIdx];
1061           return Ty.getSizeInBits() < 32;
1062         },
1063         changeTo(LitTyIdx, S32));
1064     }
1065 
1066     Builder.widenScalarIf(
1067       [=](const LegalityQuery &Query) {
1068         const LLT Ty = Query.Types[BigTyIdx];
1069         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1070           Ty.getSizeInBits() % 16 != 0;
1071       },
1072       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1075         const LLT &Ty = Query.Types[BigTyIdx];
1076         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1077         if (NewSizeInBits >= 256) {
1078           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1079           if (RoundedTo < NewSizeInBits)
1080             NewSizeInBits = RoundedTo;
1081         }
1082         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1083       })
1084       .legalIf([=](const LegalityQuery &Query) {
1085           const LLT &BigTy = Query.Types[BigTyIdx];
1086           const LLT &LitTy = Query.Types[LitTyIdx];
1087 
1088           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1089             return false;
1090           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1091             return false;
1092 
1093           return BigTy.getSizeInBits() % 16 == 0 &&
1094                  LitTy.getSizeInBits() % 16 == 0 &&
1095                  BigTy.getSizeInBits() <= 1024;
1096         })
1097       // Any vectors left are the wrong size. Scalarize them.
1098       .scalarize(0)
1099       .scalarize(1);
1100   }
1101 
1102   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1103 
1104   computeTables();
1105   verify(*ST.getInstrInfo());
1106 }
1107 
1108 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1109                                          MachineRegisterInfo &MRI,
1110                                          MachineIRBuilder &B,
1111                                          GISelChangeObserver &Observer) const {
1112   switch (MI.getOpcode()) {
1113   case TargetOpcode::G_ADDRSPACE_CAST:
1114     return legalizeAddrSpaceCast(MI, MRI, B);
1115   case TargetOpcode::G_FRINT:
1116     return legalizeFrint(MI, MRI, B);
1117   case TargetOpcode::G_FCEIL:
1118     return legalizeFceil(MI, MRI, B);
1119   case TargetOpcode::G_INTRINSIC_TRUNC:
1120     return legalizeIntrinsicTrunc(MI, MRI, B);
1121   case TargetOpcode::G_SITOFP:
1122     return legalizeITOFP(MI, MRI, B, true);
1123   case TargetOpcode::G_UITOFP:
1124     return legalizeITOFP(MI, MRI, B, false);
1125   case TargetOpcode::G_FMINNUM:
1126   case TargetOpcode::G_FMAXNUM:
1127   case TargetOpcode::G_FMINNUM_IEEE:
1128   case TargetOpcode::G_FMAXNUM_IEEE:
1129     return legalizeMinNumMaxNum(MI, MRI, B);
1130   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1131     return legalizeExtractVectorElt(MI, MRI, B);
1132   case TargetOpcode::G_INSERT_VECTOR_ELT:
1133     return legalizeInsertVectorElt(MI, MRI, B);
1134   case TargetOpcode::G_FSIN:
1135   case TargetOpcode::G_FCOS:
1136     return legalizeSinCos(MI, MRI, B);
1137   case TargetOpcode::G_GLOBAL_VALUE:
1138     return legalizeGlobalValue(MI, MRI, B);
1139   case TargetOpcode::G_LOAD:
1140     return legalizeLoad(MI, MRI, B, Observer);
1141   case TargetOpcode::G_FMAD:
1142     return legalizeFMad(MI, MRI, B);
1143   case TargetOpcode::G_FDIV:
1144     return legalizeFDIV(MI, MRI, B);
1145   case TargetOpcode::G_ATOMIC_CMPXCHG:
1146     return legalizeAtomicCmpXChg(MI, MRI, B);
1147   default:
1148     return false;
1149   }
1150 
1151   llvm_unreachable("expected switch to return");
1152 }
1153 
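// Return a 32-bit register containing the aperture (the high half of the
// 64-bit flat address range) for the given LDS or private address space,
// read from the hardware aperture registers when available, or loaded from
// the queue pointer otherwise.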
1154 Register AMDGPULegalizerInfo::getSegmentAperture(
1155   unsigned AS,
1156   MachineRegisterInfo &MRI,
1157   MachineIRBuilder &B) const {
1158   MachineFunction &MF = B.getMF();
1159   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1160   const LLT S32 = LLT::scalar(32);
1161 
1162   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1163 
1164   if (ST.hasApertureRegs()) {
1165     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1166     // getreg.
1167     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1168         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1169         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1170     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1171         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1172         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
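    // Pack the s_getreg_b32 immediate: hardware register id, bit offset of
    // the aperture field, and field width minus one.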
1173     unsigned Encoding =
1174         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1175         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1176         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1177 
1178     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1179     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1180 
1181     B.buildInstr(AMDGPU::S_GETREG_B32)
1182       .addDef(GetReg)
1183       .addImm(Encoding);
1184     MRI.setType(GetReg, S32);
1185 
1186     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1187     B.buildInstr(TargetOpcode::G_SHL)
1188       .addDef(ApertureReg)
1189       .addUse(GetReg)
1190       .addUse(ShiftAmt.getReg(0));
1191 
1192     return ApertureReg;
1193   }
1194 
1195   Register QueuePtr = MRI.createGenericVirtualRegister(
1196     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1197 
1198   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1199   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1200     return Register();
1201 
1202   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1203   // private_segment_aperture_base_hi.
1204   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1205 
1206   // TODO: can we be smarter about machine pointer info?
1207   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1208   MachineMemOperand *MMO = MF.getMachineMemOperand(
1209     PtrInfo,
1210     MachineMemOperand::MOLoad |
1211     MachineMemOperand::MODereferenceable |
1212     MachineMemOperand::MOInvariant,
1213     4,
1214     MinAlign(64, StructOffset));
1215 
1216   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1217   Register LoadAddr;
1218 
1219   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1220   B.buildLoad(LoadResult, LoadAddr, *MMO);
1221   return LoadResult;
1222 }
1223 
1224 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1225   MachineInstr &MI, MachineRegisterInfo &MRI,
1226   MachineIRBuilder &B) const {
1227   MachineFunction &MF = B.getMF();
1228 
1229   B.setInstr(MI);
1230 
1231   const LLT S32 = LLT::scalar(32);
1232   Register Dst = MI.getOperand(0).getReg();
1233   Register Src = MI.getOperand(1).getReg();
1234 
1235   LLT DstTy = MRI.getType(Dst);
1236   LLT SrcTy = MRI.getType(Src);
1237   unsigned DestAS = DstTy.getAddressSpace();
1238   unsigned SrcAS = SrcTy.getAddressSpace();
1239 
1240   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1241   // vector element.
1242   assert(!DstTy.isVector());
1243 
1244   const AMDGPUTargetMachine &TM
1245     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1246 
1247   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1248   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1249     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1250     return true;
1251   }
1252 
1253   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1254     // Truncate.
1255     B.buildExtract(Dst, Src, 0);
1256     MI.eraseFromParent();
1257     return true;
1258   }
1259 
1260   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1261     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1262     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1263 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1267     auto HighAddr = B.buildConstant(
1268       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1269     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1270     MI.eraseFromParent();
1271     return true;
1272   }
1273 
1274   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1275     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1276            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1277     unsigned NullVal = TM.getNullPointerValue(DestAS);
1278 
1279     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1280     auto FlatNull = B.buildConstant(SrcTy, 0);
1281 
1282     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1283 
1284     // Extract low 32-bits of the pointer.
1285     B.buildExtract(PtrLo32, Src, 0);
1286 
1287     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1288     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1289     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1290 
1291     MI.eraseFromParent();
1292     return true;
1293   }
1294 
1295   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1296     return false;
1297 
1298   if (!ST.hasFlatAddressSpace())
1299     return false;
1300 
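  // Cast from LDS/private to flat: a segment null pointer maps to the flat
  // null value; otherwise build the 64-bit flat pointer from the 32-bit
  // source offset and the segment aperture.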
1301   auto SegmentNull =
1302       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1303   auto FlatNull =
1304       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1305 
1306   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1307   if (!ApertureReg.isValid())
1308     return false;
1309 
1310   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1311   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1312 
1313   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1314 
1315   // Coerce the type of the low half of the result so we can use merge_values.
1316   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1317   B.buildInstr(TargetOpcode::G_PTRTOINT)
1318     .addDef(SrcAsInt)
1319     .addUse(Src);
1320 
1321   // TODO: Should we allow mismatched types but matching sizes in merges to
1322   // avoid the ptrtoint?
1323   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1324   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1325 
1326   MI.eraseFromParent();
1327   return true;
1328 }
1329 
1330 bool AMDGPULegalizerInfo::legalizeFrint(
1331   MachineInstr &MI, MachineRegisterInfo &MRI,
1332   MachineIRBuilder &B) const {
1333   B.setInstr(MI);
1334 
1335   Register Src = MI.getOperand(1).getReg();
1336   LLT Ty = MRI.getType(Src);
1337   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1338 
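  // Round by adding and subtracting 2^52 with the sign of the source; inputs
  // whose magnitude is at least 2^52 have no fractional bits and are returned
  // unchanged.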
1339   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1340   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1341 
1342   auto C1 = B.buildFConstant(Ty, C1Val);
1343   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1344 
1345   // TODO: Should this propagate fast-math-flags?
1346   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1347   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1348 
1349   auto C2 = B.buildFConstant(Ty, C2Val);
1350   auto Fabs = B.buildFAbs(Ty, Src);
1351 
1352   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1353   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1354   return true;
1355 }
1356 
1357 bool AMDGPULegalizerInfo::legalizeFceil(
1358   MachineInstr &MI, MachineRegisterInfo &MRI,
1359   MachineIRBuilder &B) const {
1360   B.setInstr(MI);
1361 
1362   const LLT S1 = LLT::scalar(1);
1363   const LLT S64 = LLT::scalar(64);
1364 
1365   Register Src = MI.getOperand(1).getReg();
1366   assert(MRI.getType(Src) == S64);
1367 
1368   // result = trunc(src)
1369   // if (src > 0.0 && src != result)
1370   //   result += 1.0
1371 
1372   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1373 
1374   const auto Zero = B.buildFConstant(S64, 0.0);
1375   const auto One = B.buildFConstant(S64, 1.0);
1376   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1377   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1378   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1379   auto Add = B.buildSelect(S64, And, One, Zero);
1380 
1381   // TODO: Should this propagate fast-math-flags?
1382   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1383   return true;
1384 }
1385 
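// Extract the 11-bit biased exponent from the high 32 bits of an f64 and
// subtract the bias (1023).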
1386 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1387                                               MachineIRBuilder &B) {
1388   const unsigned FractBits = 52;
1389   const unsigned ExpBits = 11;
1390   LLT S32 = LLT::scalar(32);
1391 
1392   auto Const0 = B.buildConstant(S32, FractBits - 32);
1393   auto Const1 = B.buildConstant(S32, ExpBits);
1394 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1398 
1399   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1400 }
1401 
1402 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1403   MachineInstr &MI, MachineRegisterInfo &MRI,
1404   MachineIRBuilder &B) const {
1405   B.setInstr(MI);
1406 
1407   const LLT S1 = LLT::scalar(1);
1408   const LLT S32 = LLT::scalar(32);
1409   const LLT S64 = LLT::scalar(64);
1410 
1411   Register Src = MI.getOperand(1).getReg();
1412   assert(MRI.getType(Src) == S64);
1413 
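  // Truncate toward zero by clearing the fractional bits: shift the fraction
  // mask right by the exponent and mask those bits off. An exponent < 0 means
  // |x| < 1, so only the sign bit survives; an exponent > 51 means the value
  // is already an integer.
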
1414   // TODO: Should this use extract since the low half is unused?
1415   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1416   Register Hi = Unmerge.getReg(1);
1417 
1418   // Extract the upper half, since this is where we will find the sign and
1419   // exponent.
1420   auto Exp = extractF64Exponent(Hi, B);
1421 
1422   const unsigned FractBits = 52;
1423 
1424   // Extract the sign bit.
1425   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1426   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1427 
1428   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1429 
1430   const auto Zero32 = B.buildConstant(S32, 0);
1431 
1432   // Extend back to 64-bits.
1433   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1434 
1435   auto Shr = B.buildAShr(S64, FractMask, Exp);
1436   auto Not = B.buildNot(S64, Shr);
1437   auto Tmp0 = B.buildAnd(S64, Src, Not);
1438   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1439 
1440   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1441   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1442 
1443   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1444   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
1445   return true;
1446 }
1447 
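// Lower 64-bit G_SITOFP/G_UITOFP to f64: convert the two 32-bit halves
// separately, scale the converted high half by 2^32 with amdgcn.ldexp, and
// add the (always unsigned) conversion of the low half.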
1448 bool AMDGPULegalizerInfo::legalizeITOFP(
1449   MachineInstr &MI, MachineRegisterInfo &MRI,
1450   MachineIRBuilder &B, bool Signed) const {
1451   B.setInstr(MI);
1452 
1453   Register Dst = MI.getOperand(0).getReg();
1454   Register Src = MI.getOperand(1).getReg();
1455 
1456   const LLT S64 = LLT::scalar(64);
1457   const LLT S32 = LLT::scalar(32);
1458 
1459   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1460 
1461   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1462 
1463   auto CvtHi = Signed ?
1464     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1465     B.buildUITOFP(S64, Unmerge.getReg(1));
1466 
1467   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1468 
1469   auto ThirtyTwo = B.buildConstant(S32, 32);
1470   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1471     .addUse(CvtHi.getReg(0))
1472     .addUse(ThirtyTwo.getReg(0));
1473 
1474   // TODO: Should this propagate fast-math-flags?
1475   B.buildFAdd(Dst, LdExp, CvtLo);
1476   MI.eraseFromParent();
1477   return true;
1478 }
1479 
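// Legalize G_FMINNUM/G_FMAXNUM and their IEEE variants according to the
// function's FP mode: with IEEE mode enabled only the IEEE forms are legal
// as-is, and the plain forms are expanded through the LegalizerHelper.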
1480 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1481   MachineInstr &MI, MachineRegisterInfo &MRI,
1482   MachineIRBuilder &B) const {
1483   MachineFunction &MF = B.getMF();
1484   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1485 
1486   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1487                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1488 
1489   // With ieee_mode disabled, the instructions already have the correct
1490   // behavior for G_FMINNUM/G_FMAXNUM.
1491   if (!MFI->getMode().IEEE)
1492     return !IsIEEEOp;
1493 
1494   if (IsIEEEOp)
1495     return true;
1496 
1497   MachineIRBuilder HelperBuilder(MI);
1498   GISelObserverWrapper DummyObserver;
1499   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1500   HelperBuilder.setInstr(MI);
1501   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1502 }
1503 
1504 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1505   MachineInstr &MI, MachineRegisterInfo &MRI,
1506   MachineIRBuilder &B) const {
1507   // TODO: Should move some of this into LegalizerHelper.
1508 
1509   // TODO: Promote dynamic indexing of s16 to s32
1510   // TODO: Dynamic s64 indexing is only legal for SGPR.
1511   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1512   if (!IdxVal) // Dynamic case will be selected to register indexing.
1513     return true;
1514 
1515   Register Dst = MI.getOperand(0).getReg();
1516   Register Vec = MI.getOperand(1).getReg();
1517 
1518   LLT VecTy = MRI.getType(Vec);
1519   LLT EltTy = VecTy.getElementType();
1520   assert(EltTy == MRI.getType(Dst));
1521 
1522   B.setInstr(MI);
1523 
1524   if (IdxVal.getValue() < VecTy.getNumElements())
1525     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1526   else
1527     B.buildUndef(Dst);
1528 
1529   MI.eraseFromParent();
1530   return true;
1531 }
1532 
1533 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1534   MachineInstr &MI, MachineRegisterInfo &MRI,
1535   MachineIRBuilder &B) const {
1536   // TODO: Should move some of this into LegalizerHelper.
1537 
1538   // TODO: Promote dynamic indexing of s16 to s32
1539   // TODO: Dynamic s64 indexing is only legal for SGPR.
1540   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1541   if (!IdxVal) // Dynamic case will be selected to register indexing.
1542     return true;
1543 
1544   Register Dst = MI.getOperand(0).getReg();
1545   Register Vec = MI.getOperand(1).getReg();
1546   Register Ins = MI.getOperand(2).getReg();
1547 
1548   LLT VecTy = MRI.getType(Vec);
1549   LLT EltTy = VecTy.getElementType();
1550   assert(EltTy == MRI.getType(Ins));
1551 
1552   B.setInstr(MI);
1553 
1554   if (IdxVal.getValue() < VecTy.getNumElements())
1555     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1556   else
1557     B.buildUndef(Dst);
1558 
1559   MI.eraseFromParent();
1560   return true;
1561 }
1562 
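// Expand G_FSIN/G_FCOS: the hardware sin/cos intrinsics take an input scaled
// by 1/(2*pi), and subtargets with a reduced trig range additionally need the
// scaled value wrapped into [0, 1) with amdgcn.fract.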
1563 bool AMDGPULegalizerInfo::legalizeSinCos(
1564   MachineInstr &MI, MachineRegisterInfo &MRI,
1565   MachineIRBuilder &B) const {
1566   B.setInstr(MI);
1567 
1568   Register DstReg = MI.getOperand(0).getReg();
1569   Register SrcReg = MI.getOperand(1).getReg();
1570   LLT Ty = MRI.getType(DstReg);
1571   unsigned Flags = MI.getFlags();
1572 
1573   Register TrigVal;
1574   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1575   if (ST.hasTrigReducedRange()) {
1576     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1577     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1578       .addUse(MulVal.getReg(0))
1579       .setMIFlags(Flags).getReg(0);
1580   } else
1581     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1582 
1583   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1584     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1585   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1586     .addUse(TrigVal)
1587     .setMIFlags(Flags);
1588   MI.eraseFromParent();
1589   return true;
1590 }
1591 
1592 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1593   Register DstReg, LLT PtrTy,
1594   MachineIRBuilder &B, const GlobalValue *GV,
1595   unsigned Offset, unsigned GAFlags) const {
1596   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1597   // to the following code sequence:
1598   //
1599   // For constant address space:
1600   //   s_getpc_b64 s[0:1]
1601   //   s_add_u32 s0, s0, $symbol
1602   //   s_addc_u32 s1, s1, 0
1603   //
1604   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1605   //   a fixup or relocation is emitted to replace $symbol with a literal
1606   //   constant, which is a pc-relative offset from the encoding of the $symbol
1607   //   operand to the global variable.
1608   //
1609   // For global address space:
1610   //   s_getpc_b64 s[0:1]
1611   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1612   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1613   //
1614   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1615   //   fixups or relocations are emitted to replace $symbol@*@lo and
1616   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1617   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1618   //   operand to the global variable.
1619   //
1620   // What we want here is an offset from the value returned by s_getpc
1621   // (which is the address of the s_add_u32 instruction) to the global
1622   // variable, but since the encoding of $symbol starts 4 bytes after the start
1623   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1624   // small. This requires us to add 4 to the global variable offset in order to
1625   // compute the correct address.
1626 
1627   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1628 
1629   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1630     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1631 
1632   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1633     .addDef(PCReg);
1634 
1635   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1636   if (GAFlags == SIInstrInfo::MO_NONE)
1637     MIB.addImm(0);
1638   else
1639     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1640 
1641   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1642 
1643   if (PtrTy.getSizeInBits() == 32)
1644     B.buildExtract(DstReg, PCReg, 0);
1645   return true;
1646 }
1647 
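// Legalize G_GLOBAL_VALUE. LDS globals become their statically allocated
// offset (defined initializers are diagnosed as unsupported); globals that
// can use a fixup or PC-relative relocation are materialized with
// SI_PC_ADD_REL_OFFSET; everything else is loaded through the GOT.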
1648 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1649   MachineInstr &MI, MachineRegisterInfo &MRI,
1650   MachineIRBuilder &B) const {
1651   Register DstReg = MI.getOperand(0).getReg();
1652   LLT Ty = MRI.getType(DstReg);
1653   unsigned AS = Ty.getAddressSpace();
1654 
1655   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1656   MachineFunction &MF = B.getMF();
1657   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1658   B.setInstr(MI);
1659 
1660   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1661     if (!MFI->isEntryFunction()) {
1662       const Function &Fn = MF.getFunction();
1663       DiagnosticInfoUnsupported BadLDSDecl(
1664         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1665       Fn.getContext().diagnose(BadLDSDecl);
1666     }
1667 
1668     // TODO: We could emit code to handle the initialization somewhere.
1669     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1670       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                     *cast<GlobalVariable>(GV)));
1671       MI.eraseFromParent();
1672       return true;
1673     }
1674 
1675     const Function &Fn = MF.getFunction();
1676     DiagnosticInfoUnsupported BadInit(
1677       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1678     Fn.getContext().diagnose(BadInit);
1679     return true;
1680   }
1681 
1682   const SITargetLowering *TLI = ST.getTargetLowering();
1683 
1684   if (TLI->shouldEmitFixup(GV)) {
1685     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1686     MI.eraseFromParent();
1687     return true;
1688   }
1689 
1690   if (TLI->shouldEmitPCReloc(GV)) {
1691     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1692     MI.eraseFromParent();
1693     return true;
1694   }
1695 
1696   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1697   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1698 
1699   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1700     MachinePointerInfo::getGOT(MF),
1701     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1702     MachineMemOperand::MOInvariant,
1703     8 /*Size*/, 8 /*Align*/);
1704 
1705   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1706 
1707   if (Ty.getSizeInBits() == 32) {
1708     // Truncate if this is a 32-bit constant address.
1709     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1710     B.buildExtract(DstReg, Load, 0);
1711   } else
1712     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1713 
1714   MI.eraseFromParent();
1715   return true;
1716 }
1717 
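// Custom legalization for loads: rewrite the pointer operand with an
// addrspacecast to the 64-bit constant address space, updating the load in
// place.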
1718 bool AMDGPULegalizerInfo::legalizeLoad(
1719   MachineInstr &MI, MachineRegisterInfo &MRI,
1720   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1721   B.setInstr(MI);
1722   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1723   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1724   Observer.changingInstr(MI);
1725   MI.getOperand(1).setReg(Cast.getReg(0));
1726   Observer.changedInstr(MI);
1727   return true;
1728 }
1729 
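// G_FMAD is kept only when denormals are flushed for the result type;
// otherwise it is expanded to a multiply and add by the LegalizerHelper.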
1730 bool AMDGPULegalizerInfo::legalizeFMad(
1731   MachineInstr &MI, MachineRegisterInfo &MRI,
1732   MachineIRBuilder &B) const {
1733   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1734   assert(Ty.isScalar());
1735 
1736   MachineFunction &MF = B.getMF();
1737   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1738 
1739   // TODO: Always legal with future ftz flag.
1740   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1741     return true;
1742   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1743     return true;
1744
1746   MachineIRBuilder HelperBuilder(MI);
1747   GISelObserverWrapper DummyObserver;
1748   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1749   HelperBuilder.setMBB(*MI.getParent());
1750   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1751 }
1752 
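// Rewrite G_ATOMIC_CMPXCHG on flat/global pointers to the target's
// G_AMDGPU_ATOMIC_CMPXCHG pseudo, packing the new value and the compare value
// into a two-element vector operand.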
1753 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1754   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1755   Register DstReg = MI.getOperand(0).getReg();
1756   Register PtrReg = MI.getOperand(1).getReg();
1757   Register CmpVal = MI.getOperand(2).getReg();
1758   Register NewVal = MI.getOperand(3).getReg();
1759 
1760   assert(SITargetLowering::isFlatGlobalAddrSpace(
1761            MRI.getType(PtrReg).getAddressSpace()) &&
1762          "this should not have been custom lowered");
1763 
1764   LLT ValTy = MRI.getType(CmpVal);
1765   LLT VecTy = LLT::vector(2, ValTy);
1766 
1767   B.setInstr(MI);
1768   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1769 
1770   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1771     .addDef(DstReg)
1772     .addUse(PtrReg)
1773     .addUse(PackedVal)
1774     .setMemRefs(MI.memoperands());
1775 
1776   MI.eraseFromParent();
1777   return true;
1778 }
1779 
1780 // Return the branch instruction that uses the condition, or null if invalid.
1781 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1782                                        MachineRegisterInfo &MRI) {
1783   Register CondDef = MI.getOperand(0).getReg();
1784   if (!MRI.hasOneNonDBGUse(CondDef))
1785     return nullptr;
1786 
1787   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1788   return UseMI.getParent() == MI.getParent() &&
1789     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1790 }
1791 
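// Return the virtual register associated with the physical live-in Reg,
// creating one of type Ty and registering the live-in if it does not exist
// yet.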
1792 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1793                                                 Register Reg, LLT Ty) const {
1794   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1795   if (LiveIn)
1796     return LiveIn;
1797 
1798   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1799   MRI.addLiveIn(Reg, NewReg);
1800   return NewReg;
1801 }
1802 
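// Copy a preloaded argument register into DstReg. Masked arguments (values
// packed into part of a register) are shifted and masked out, and the
// entry-block copy from the physical register is inserted if it does not
// already exist.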
1803 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1804                                          const ArgDescriptor *Arg) const {
1805   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1806     return false; // TODO: Handle these
1807 
1808   assert(Arg->getRegister().isPhysical());
1809 
1810   MachineRegisterInfo &MRI = *B.getMRI();
1811 
1812   LLT Ty = MRI.getType(DstReg);
1813   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1814 
1815   if (Arg->isMasked()) {
1816     // TODO: Should we try to emit this once in the entry block?
1817     const LLT S32 = LLT::scalar(32);
1818     const unsigned Mask = Arg->getMask();
1819     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1820 
1821     Register AndMaskSrc = LiveIn;
1822 
1823     if (Shift != 0) {
1824       auto ShiftAmt = B.buildConstant(S32, Shift);
1825       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1826     }
1827 
1828     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1829   } else
1830     B.buildCopy(DstReg, LiveIn);
1831 
1832   // Insert the argument copy if it doesn't already exist.
1833   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1834   if (!MRI.getVRegDef(LiveIn)) {
1835     // FIXME: Should have scoped insert pt
1836     MachineBasicBlock &OrigInsBB = B.getMBB();
1837     auto OrigInsPt = B.getInsertPt();
1838 
1839     MachineBasicBlock &EntryMBB = B.getMF().front();
1840     EntryMBB.addLiveIn(Arg->getRegister());
1841     B.setInsertPt(EntryMBB, EntryMBB.begin());
1842     B.buildCopy(LiveIn, Arg->getRegister());
1843 
1844     B.setInsertPt(OrigInsBB, OrigInsPt);
1845   }
1846 
1847   return true;
1848 }
1849 
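// Legalize an intrinsic that just reads a preloaded argument register: look
// up the argument descriptor for ArgType and copy its value into the result.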
1850 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1851   MachineInstr &MI,
1852   MachineRegisterInfo &MRI,
1853   MachineIRBuilder &B,
1854   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1855   B.setInstr(MI);
1856 
1857   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1858 
1859   const ArgDescriptor *Arg;
1860   const TargetRegisterClass *RC;
1861   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1862   if (!Arg) {
1863     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1864     return false;
1865   }
1866 
1867   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1868     MI.eraseFromParent();
1869     return true;
1870   }
1871 
1872   return false;
1873 }
1874 
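// Legalize G_FDIV: try the fast, unsafe expansion first, then fall back to
// the precision-correct expansion for the result type.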
1875 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1876                                        MachineRegisterInfo &MRI,
1877                                        MachineIRBuilder &B) const {
1878   B.setInstr(MI);
1879   Register Dst = MI.getOperand(0).getReg();
1880   LLT DstTy = MRI.getType(Dst);
1881   LLT S16 = LLT::scalar(16);
1882   LLT S32 = LLT::scalar(32);
1883   LLT S64 = LLT::scalar(64);
1884 
1885   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1886     return true;
1887 
1888   if (DstTy == S16)
1889     return legalizeFDIV16(MI, MRI, B);
1890   if (DstTy == S32)
1891     return legalizeFDIV32(MI, MRI, B);
1892   if (DstTy == S64)
1893     return legalizeFDIV64(MI, MRI, B);
1894 
1895   return false;
1896 }
1897 
1898 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1899                                                  MachineRegisterInfo &MRI,
1900                                                  MachineIRBuilder &B) const {
1901   Register Res = MI.getOperand(0).getReg();
1902   Register LHS = MI.getOperand(1).getReg();
1903   Register RHS = MI.getOperand(2).getReg();
1904 
1905   uint16_t Flags = MI.getFlags();
1906 
1907   LLT ResTy = MRI.getType(Res);
1908   LLT S32 = LLT::scalar(32);
1909   LLT S64 = LLT::scalar(64);
1910 
1911   const MachineFunction &MF = B.getMF();
1912   bool Unsafe =
1913     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1914 
1915   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1916     return false;
1917 
1918   if (!Unsafe && ResTy == S32 &&
1919       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1920     return false;
1921 
1922   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1923     // 1 / x -> RCP(x)
1924     if (CLHS->isExactlyValue(1.0)) {
1925       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1926         .addUse(RHS)
1927         .setMIFlags(Flags);
1928 
1929       MI.eraseFromParent();
1930       return true;
1931     }
1932 
1933     // -1 / x -> RCP( FNEG(x) )
1934     if (CLHS->isExactlyValue(-1.0)) {
1935       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1936       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1937         .addUse(FNeg.getReg(0))
1938         .setMIFlags(Flags);
1939 
1940       MI.eraseFromParent();
1941       return true;
1942     }
1943   }
1944 
1945   // x / y -> x * (1.0 / y)
1946   if (Unsafe) {
1947     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1948       .addUse(RHS)
1949       .setMIFlags(Flags);
1950     B.buildFMul(Res, LHS, RCP, Flags);
1951 
1952     MI.eraseFromParent();
1953     return true;
1954   }
1955 
1956   return false;
1957 }
1958 
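// Expand f16 G_FDIV by computing the quotient in f32 (extend, multiply by
// amdgcn.rcp of the denominator, truncate) and fixing the result up with
// amdgcn.div.fixup.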
1959 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1960                                          MachineRegisterInfo &MRI,
1961                                          MachineIRBuilder &B) const {
1962   B.setInstr(MI);
1963   Register Res = MI.getOperand(0).getReg();
1964   Register LHS = MI.getOperand(1).getReg();
1965   Register RHS = MI.getOperand(2).getReg();
1966 
1967   uint16_t Flags = MI.getFlags();
1968 
1969   LLT S16 = LLT::scalar(16);
1970   LLT S32 = LLT::scalar(32);
1971 
1972   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
1973   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
1974 
1975   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1976     .addUse(RHSExt.getReg(0))
1977     .setMIFlags(Flags);
1978 
1979   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
1980   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
1981 
1982   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
1983     .addUse(RDst.getReg(0))
1984     .addUse(RHS)
1985     .addUse(LHS)
1986     .setMIFlags(Flags);
1987 
1988   MI.eraseFromParent();
1989   return true;
1990 }
1991 
1992 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
1993 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
1994 static void toggleSPDenormMode(bool Enable,
1995                                MachineIRBuilder &B,
1996                                const GCNSubtarget &ST,
1997                                AMDGPU::SIModeRegisterDefaults Mode) {
1998   // Set SP denorm mode to this value.
1999   unsigned SPDenormMode =
2000     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2001 
2002   if (ST.hasDenormModeInst()) {
2003     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2004     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2005                                    ? FP_DENORM_FLUSH_NONE
2006                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2007 
2008     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2009     B.buildInstr(AMDGPU::S_DENORM_MODE)
2010       .addImm(NewDenormModeValue);
2011 
2012   } else {
2013     // Select FP32 bit field in mode register.
2014     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2015                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2016                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2017 
2018     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2019       .addImm(SPDenormMode)
2020       .addImm(SPDenormModeBitField);
2021   }
2022 }
2023 
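// Expand f32 G_FDIV using the div_scale/div_fmas/div_fixup sequence, refining
// an initial amdgcn.rcp estimate with fused multiply-adds. If FP32 denormals
// are disabled in the mode register, they are temporarily enabled around the
// core iteration.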
2024 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2025                                          MachineRegisterInfo &MRI,
2026                                          MachineIRBuilder &B) const {
2027   B.setInstr(MI);
2028   Register Res = MI.getOperand(0).getReg();
2029   Register LHS = MI.getOperand(1).getReg();
2030   Register RHS = MI.getOperand(2).getReg();
2031   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2032   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2033 
2034   uint16_t Flags = MI.getFlags();
2035 
2036   LLT S32 = LLT::scalar(32);
2037   LLT S1 = LLT::scalar(1);
2038 
2039   auto One = B.buildFConstant(S32, 1.0f);
2040 
2041   auto DenominatorScaled =
2042     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2043       .addUse(RHS)
2044       .addUse(LHS)
2045       .addImm(1)
2046       .setMIFlags(Flags);
2047   auto NumeratorScaled =
2048     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2049       .addUse(LHS)
2050       .addUse(RHS)
2051       .addImm(0)
2052       .setMIFlags(Flags);
2053 
2054   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2055     .addUse(DenominatorScaled.getReg(0))
2056     .setMIFlags(Flags);
2057   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2058 
2059   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2060   // aren't modeled as reading it.
2061   if (!Mode.FP32Denormals)
2062     toggleSPDenormMode(true, B, ST, Mode);
2063 
2064   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2065   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2066   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2067   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2068   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2069   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2070 
2071   if (!Mode.FP32Denormals)
2072     toggleSPDenormMode(false, B, ST, Mode);
2073 
2074   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2075     .addUse(Fma4.getReg(0))
2076     .addUse(Fma1.getReg(0))
2077     .addUse(Fma3.getReg(0))
2078     .addUse(NumeratorScaled.getReg(1))
2079     .setMIFlags(Flags);
2080 
2081   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2082     .addUse(Fmas.getReg(0))
2083     .addUse(RHS)
2084     .addUse(LHS)
2085     .setMIFlags(Flags);
2086 
2087   MI.eraseFromParent();
2088   return true;
2089 }
2090 
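// Expand f64 G_FDIV with the same div_scale/div_fmas/div_fixup pattern as the
// f32 case. On subtargets where the div_scale condition output is unusable,
// the condition is recomputed by comparing the high halves of the scaled and
// unscaled operands.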
2091 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2092                                          MachineRegisterInfo &MRI,
2093                                          MachineIRBuilder &B) const {
2094   B.setInstr(MI);
2095   Register Res = MI.getOperand(0).getReg();
2096   Register LHS = MI.getOperand(1).getReg();
2097   Register RHS = MI.getOperand(2).getReg();
2098 
2099   uint16_t Flags = MI.getFlags();
2100 
2101   LLT S64 = LLT::scalar(64);
2102   LLT S1 = LLT::scalar(1);
2103 
2104   auto One = B.buildFConstant(S64, 1.0);
2105 
2106   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2107     .addUse(LHS)
2108     .addUse(RHS)
2109     .addImm(1)
2110     .setMIFlags(Flags);
2111 
2112   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2113 
2114   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2115     .addUse(DivScale0.getReg(0))
2116     .setMIFlags(Flags);
2117 
2118   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2119   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2120   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2121 
2122   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2123     .addUse(LHS)
2124     .addUse(RHS)
2125     .addImm(0)
2126     .setMIFlags(Flags);
2127 
2128   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2129   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2130   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2131 
2132   Register Scale;
2133   if (!ST.hasUsableDivScaleConditionOutput()) {
2134     // Workaround a hardware bug on SI where the condition output from div_scale
2135     // is not usable.
2136 
2137     Scale = MRI.createGenericVirtualRegister(S1);
2138 
2139     LLT S32 = LLT::scalar(32);
2140 
2141     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2142     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2143     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2144     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2145 
2146     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2147                               Scale1Unmerge.getReg(1));
2148     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2149                               Scale0Unmerge.getReg(1));
2150     B.buildXor(Scale, CmpNum, CmpDen);
2151   } else {
2152     Scale = DivScale1.getReg(1);
2153   }
2154 
2155   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2156     .addUse(Fma4.getReg(0))
2157     .addUse(Fma3.getReg(0))
2158     .addUse(Mul.getReg(0))
2159     .addUse(Scale)
2160     .setMIFlags(Flags);
2161 
2162   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2163     .addUse(Fmas.getReg(0))
2164     .addUse(RHS)
2165     .addUse(LHS)
2166     .setMIFlags(Flags);
2167 
2168   MI.eraseFromParent();
2169   return true;
2170 }
2171 
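// Lower amdgcn.fdiv.fast: if |denominator| is large, pre-scale it by 2^-32
// before taking amdgcn.rcp and apply the same scale to the quotient, keeping
// the intermediate reciprocal in range.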
2172 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2173                                                  MachineRegisterInfo &MRI,
2174                                                  MachineIRBuilder &B) const {
2175   B.setInstr(MI);
2176   Register Res = MI.getOperand(0).getReg();
2177   Register LHS = MI.getOperand(2).getReg();
2178   Register RHS = MI.getOperand(3).getReg();
2179   uint16_t Flags = MI.getFlags();
2180 
2181   LLT S32 = LLT::scalar(32);
2182   LLT S1 = LLT::scalar(1);
2183 
2184   auto Abs = B.buildFAbs(S32, RHS, Flags);
2185   const APFloat C0Val(1.0f);
2186 
2187   auto C0 = B.buildConstant(S32, 0x6f800000);
2188   auto C1 = B.buildConstant(S32, 0x2f800000);
2189   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2190 
2191   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2192   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2193 
2194   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2195 
2196   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2197     .addUse(Mul0.getReg(0))
2198     .setMIFlags(Flags);
2199 
2200   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2201 
2202   B.buildFMul(Res, Sel, Mul1, Flags);
2203 
2204   MI.eraseFromParent();
2205   return true;
2206 }
2207 
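// Materialize the implicit argument pointer. For entry functions this is the
// kernarg segment pointer plus the implicit parameter offset; in callable
// functions it is the preloaded implicit-arg register.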
2208 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2209                                                  MachineRegisterInfo &MRI,
2210                                                  MachineIRBuilder &B) const {
2211   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2212   if (!MFI->isEntryFunction()) {
2213     return legalizePreloadedArgIntrin(MI, MRI, B,
2214                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2215   }
2216 
2217   B.setInstr(MI);
2218 
2219   uint64_t Offset =
2220     ST.getTargetLowering()->getImplicitParameterOffset(
2221       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2222   Register DstReg = MI.getOperand(0).getReg();
2223   LLT DstTy = MRI.getType(DstReg);
2224   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2225 
2226   const ArgDescriptor *Arg;
2227   const TargetRegisterClass *RC;
2228   std::tie(Arg, RC)
2229     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2230   if (!Arg)
2231     return false;
2232 
2233   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2234   if (!loadInputValue(KernargPtrReg, B, Arg))
2235     return false;
2236 
2237   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2238   MI.eraseFromParent();
2239   return true;
2240 }
2241 
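// Lower amdgcn.is.shared/amdgcn.is.private by comparing the high 32 bits of
// the flat pointer against the corresponding aperture register.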
2242 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2243                                               MachineRegisterInfo &MRI,
2244                                               MachineIRBuilder &B,
2245                                               unsigned AddrSpace) const {
2246   B.setInstr(MI);
2247   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2248   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2249   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2250   MI.eraseFromParent();
2251   return true;
2252 }
2253 
2254 /// Handle register layout difference for f16 images for some subtargets.
2255 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2256                                              MachineRegisterInfo &MRI,
2257                                              Register Reg) const {
2258   if (!ST.hasUnpackedD16VMem())
2259     return Reg;
2260 
2261   const LLT S16 = LLT::scalar(16);
2262   const LLT S32 = LLT::scalar(32);
2263   LLT StoreVT = MRI.getType(Reg);
2264   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2265 
2266   auto Unmerge = B.buildUnmerge(S16, Reg);
2267 
2268   SmallVector<Register, 4> WideRegs;
2269   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2270     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2271 
2272   int NumElts = StoreVT.getNumElements();
2273 
2274   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2275 }
2276 
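// Legalize the data operand of raw buffer stores: i8 and i16 values are
// widened to i32, and f16 vector data for format stores is repacked on
// subtargets with unpacked D16 memory instructions.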
2277 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2278                                                  MachineRegisterInfo &MRI,
2279                                                  MachineIRBuilder &B,
2280                                                  bool IsFormat) const {
2281   // TODO: Reject f16 format on targets where unsupported.
2282   Register VData = MI.getOperand(1).getReg();
2283   LLT Ty = MRI.getType(VData);
2284 
2285   B.setInstr(MI);
2286 
2287   const LLT S32 = LLT::scalar(32);
2288   const LLT S16 = LLT::scalar(16);
2289 
2290   // Fix up illegal register types for i8 and i16 stores.
2291   if (Ty == LLT::scalar(8) || Ty == S16) {
2292     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2293     MI.getOperand(1).setReg(AnyExt);
2294     return true;
2295   }
2296 
2297   if (Ty.isVector()) {
2298     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2299       if (IsFormat)
2300         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2301       return true;
2302     }
2303 
2304     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2305   }
2306 
2307   return Ty == S32;
2308 }
2309 
2310 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2311                                             MachineRegisterInfo &MRI,
2312                                             MachineIRBuilder &B) const {
2313   // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
2314   auto IntrID = MI.getIntrinsicID();
2315   switch (IntrID) {
2316   case Intrinsic::amdgcn_if:
2317   case Intrinsic::amdgcn_else: {
2318     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2319       const SIRegisterInfo *TRI
2320         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2321 
2322       B.setInstr(*BrCond);
2323       Register Def = MI.getOperand(1).getReg();
2324       Register Use = MI.getOperand(3).getReg();
2325 
2326       if (IntrID == Intrinsic::amdgcn_if) {
2327         B.buildInstr(AMDGPU::SI_IF)
2328           .addDef(Def)
2329           .addUse(Use)
2330           .addMBB(BrCond->getOperand(1).getMBB());
2331       } else {
2332         B.buildInstr(AMDGPU::SI_ELSE)
2333           .addDef(Def)
2334           .addUse(Use)
2335           .addMBB(BrCond->getOperand(1).getMBB())
2336           .addImm(0);
2337       }
2338 
2339       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2340       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2341       MI.eraseFromParent();
2342       BrCond->eraseFromParent();
2343       return true;
2344     }
2345 
2346     return false;
2347   }
2348   case Intrinsic::amdgcn_loop: {
2349     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2350       const SIRegisterInfo *TRI
2351         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2352 
2353       B.setInstr(*BrCond);
2354       Register Reg = MI.getOperand(2).getReg();
2355       B.buildInstr(AMDGPU::SI_LOOP)
2356         .addUse(Reg)
2357         .addMBB(BrCond->getOperand(1).getMBB());
2358       MI.eraseFromParent();
2359       BrCond->eraseFromParent();
2360       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2361       return true;
2362     }
2363 
2364     return false;
2365   }
2366   case Intrinsic::amdgcn_kernarg_segment_ptr:
2367     return legalizePreloadedArgIntrin(
2368       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2369   case Intrinsic::amdgcn_implicitarg_ptr:
2370     return legalizeImplicitArgPtr(MI, MRI, B);
2371   case Intrinsic::amdgcn_workitem_id_x:
2372     return legalizePreloadedArgIntrin(MI, MRI, B,
2373                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2374   case Intrinsic::amdgcn_workitem_id_y:
2375     return legalizePreloadedArgIntrin(MI, MRI, B,
2376                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2377   case Intrinsic::amdgcn_workitem_id_z:
2378     return legalizePreloadedArgIntrin(MI, MRI, B,
2379                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2380   case Intrinsic::amdgcn_workgroup_id_x:
2381     return legalizePreloadedArgIntrin(MI, MRI, B,
2382                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2383   case Intrinsic::amdgcn_workgroup_id_y:
2384     return legalizePreloadedArgIntrin(MI, MRI, B,
2385                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2386   case Intrinsic::amdgcn_workgroup_id_z:
2387     return legalizePreloadedArgIntrin(MI, MRI, B,
2388                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2389   case Intrinsic::amdgcn_dispatch_ptr:
2390     return legalizePreloadedArgIntrin(MI, MRI, B,
2391                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2392   case Intrinsic::amdgcn_queue_ptr:
2393     return legalizePreloadedArgIntrin(MI, MRI, B,
2394                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2395   case Intrinsic::amdgcn_implicit_buffer_ptr:
2396     return legalizePreloadedArgIntrin(
2397       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2398   case Intrinsic::amdgcn_dispatch_id:
2399     return legalizePreloadedArgIntrin(MI, MRI, B,
2400                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2401   case Intrinsic::amdgcn_fdiv_fast:
2402     return legalizeFDIVFastIntrin(MI, MRI, B);
2403   case Intrinsic::amdgcn_is_shared:
2404     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2405   case Intrinsic::amdgcn_is_private:
2406     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2407   case Intrinsic::amdgcn_wavefrontsize: {
2408     B.setInstr(MI);
2409     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2410     MI.eraseFromParent();
2411     return true;
2412   }
2413   case Intrinsic::amdgcn_raw_buffer_store:
2414     return legalizeRawBufferStore(MI, MRI, B, false);
2415   case Intrinsic::amdgcn_raw_buffer_store_format:
2416     return legalizeRawBufferStore(MI, MRI, B, true);
2417   default:
2418     return true;
2419   }
2420 
2421   return true;
2422 }
2423