1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
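// Return true if the scalar or vector-element type is a multiple of 32 bits
// and the total size does not exceed MaxSize bits (1024 by default).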
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
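// Match small vectors with an odd number of sub-32-bit elements whose total
// size is not a multiple of 32 bits, e.g. v3s16 (48 bits) or v5s16 (80 bits).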
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
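// Match vectors of 16-bit elements with more than two elements, e.g. v3s16 or
// v4s16.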
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
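// Pad the vector at TypeIdx with one additional element, e.g. v3s16 -> v4s16.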
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
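// Reduce the element count at TypeIdx so the result fits in 64 bits, e.g. a
// v4s32 (128-bit) type is rebuilt as v2s32.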
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
94 // Increase the number of vector elements so the total size reaches the next
95 // multiple of 32 bits.
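// For example, a v3s8 (24-bit) vector is widened to v4s8 (32 bits).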
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
133 // Vectors of 32/64/128/256-bit elements or multiples of v2s16, and scalars
134 // that are a multiple of 32 bits up to 1024 bits.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
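// Match truncating stores of a scalar wider than 32 bits, i.e. the in-register
// type is larger than the memory size being stored.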
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal); // VCC branches
248   setAction({G_BRCOND, S32}, Legal); // SCC branches
249 
250   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
251   // elements for v3s16
252   getActionDefinitionsBuilder(G_PHI)
253     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
254     .legalFor(AllS32Vectors)
255     .legalFor(AllS64Vectors)
256     .legalFor(AddrSpaces64)
257     .legalFor(AddrSpaces32)
258     .clampScalar(0, S32, S256)
259     .widenScalarToNextPow2(0, 32)
260     .clampMaxNumElements(0, S32, 16)
261     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
262     .legalIf(isPointer(0));
263 
264   if (ST.has16BitInsts()) {
265     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
266       .legalFor({S32, S16})
267       .clampScalar(0, S16, S32)
268       .scalarize(0);
269   } else {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32})
272       .clampScalar(0, S32, S32)
273       .scalarize(0);
274   }
275 
276   // FIXME: Not really legal. Placeholder for custom lowering.
277   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278     .legalFor({S32, S64})
279     .clampScalar(0, S32, S64)
280     .widenScalarToNextPow2(0, 32)
281     .scalarize(0);
282 
283   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
284     .legalFor({S32})
285     .clampScalar(0, S32, S32)
286     .scalarize(0);
287 
288   // Report legal for any types we can handle anywhere. For the cases only legal
289   // on the SALU, RegBankSelect will be able to re-legalize.
290   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
291     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
292     .clampScalar(0, S32, S64)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
295     .widenScalarToNextPow2(0)
296     .scalarize(0);
297 
298   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
299                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300     .legalFor({{S32, S1}, {S32, S32}})
301     .clampScalar(0, S32, S32)
302     .scalarize(0); // TODO: Implement.
303 
304   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
305     .lower();
306 
307   getActionDefinitionsBuilder(G_BITCAST)
308     // Don't worry about the size constraint.
309     .legalIf(all(isRegisterType(0), isRegisterType(1)))
310     // FIXME: Testing hack
311     .legalForCartesianProduct({S16, LLT::vector(2, 8)})
312     .lower();
313 
314 
315   getActionDefinitionsBuilder(G_FCONSTANT)
316     .legalFor({S32, S64, S16})
317     .clampScalar(0, S16, S64);
318 
319   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
320     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
321                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
322     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
323     .clampScalarOrElt(0, S32, S1024)
324     .legalIf(isMultiple32(0))
325     .widenScalarToNextPow2(0, 32)
326     .clampMaxNumElements(0, S32, 16);
327 
328 
329   // FIXME: i1 operands to intrinsics should always be legal, but other i1
330   // values may not be legal.  We need to figure out how to distinguish
331   // between these two scenarios.
332   getActionDefinitionsBuilder(G_CONSTANT)
333     .legalFor({S1, S32, S64, S16, GlobalPtr,
334                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
335     .clampScalar(0, S32, S64)
336     .widenScalarToNextPow2(0)
337     .legalIf(isPointer(0));
338 
339   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
340   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
341     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
342 
343 
344   auto &FPOpActions = getActionDefinitionsBuilder(
345     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
346     .legalFor({S32, S64});
347   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
348     .customFor({S32, S64});
349   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
350     .customFor({S32, S64});
351 
352   if (ST.has16BitInsts()) {
353     if (ST.hasVOP3PInsts())
354       FPOpActions.legalFor({S16, V2S16});
355     else
356       FPOpActions.legalFor({S16});
357 
358     TrigActions.customFor({S16});
359     FDIVActions.customFor({S16});
360   }
361 
362   auto &MinNumMaxNum = getActionDefinitionsBuilder({
363       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
364 
365   if (ST.hasVOP3PInsts()) {
366     MinNumMaxNum.customFor(FPTypesPK16)
367       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
368       .clampMaxNumElements(0, S16, 2)
369       .clampScalar(0, S16, S64)
370       .scalarize(0);
371   } else if (ST.has16BitInsts()) {
372     MinNumMaxNum.customFor(FPTypes16)
373       .clampScalar(0, S16, S64)
374       .scalarize(0);
375   } else {
376     MinNumMaxNum.customFor(FPTypesBase)
377       .clampScalar(0, S32, S64)
378       .scalarize(0);
379   }
380 
381   if (ST.hasVOP3PInsts())
382     FPOpActions.clampMaxNumElements(0, S16, 2);
383 
384   FPOpActions
385     .scalarize(0)
386     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
387 
388   TrigActions
389     .scalarize(0)
390     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
391 
392   FDIVActions
393     .scalarize(0)
394     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
395 
396   getActionDefinitionsBuilder({G_FNEG, G_FABS})
397     .legalFor(FPTypesPK16)
398     .clampMaxNumElements(0, S16, 2)
399     .scalarize(0)
400     .clampScalar(0, S16, S64);
401 
402   // TODO: Implement
403   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
404 
405   if (ST.has16BitInsts()) {
406     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
407       .legalFor({S32, S64, S16})
408       .scalarize(0)
409       .clampScalar(0, S16, S64);
410   } else {
411     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
412       .legalFor({S32, S64})
413       .scalarize(0)
414       .clampScalar(0, S32, S64);
415   }
416 
417   getActionDefinitionsBuilder(G_FPTRUNC)
418     .legalFor({{S32, S64}, {S16, S32}})
419     .scalarize(0);
420 
421   getActionDefinitionsBuilder(G_FPEXT)
422     .legalFor({{S64, S32}, {S32, S16}})
423     .lowerFor({{S64, S16}}) // FIXME: Implement
424     .scalarize(0);
425 
426   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
427   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
428 
429   getActionDefinitionsBuilder(G_FSUB)
430       // Use actual fsub instruction
431       .legalFor({S32})
432       // Must use fadd + fneg
433       .lowerFor({S64, S16, V2S16})
434       .scalarize(0)
435       .clampScalar(0, S32, S64);
436 
437   // Whether this is legal depends on the floating point mode for the function.
438   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
439   if (ST.hasMadF16())
440     FMad.customFor({S32, S16});
441   else
442     FMad.customFor({S32});
443   FMad.scalarize(0)
444       .lower();
445 
446   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
447     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
448                {S32, S1}, {S64, S1}, {S16, S1},
449                {S96, S32},
450                // FIXME: Hack
451                {S64, LLT::scalar(33)},
452                {S32, S8}, {S32, LLT::scalar(24)}})
453     .scalarize(0)
454     .clampScalar(0, S32, S64);
455 
456   // TODO: Split s1->s64 during regbankselect for VALU.
457   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
458     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
459     .lowerFor({{S32, S64}})
460     .lowerIf(typeIs(1, S1))
461     .customFor({{S64, S64}});
462   if (ST.has16BitInsts())
463     IToFP.legalFor({{S16, S16}});
464   IToFP.clampScalar(1, S32, S64)
465        .scalarize(0);
466 
467   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
468     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
469   if (ST.has16BitInsts())
470     FPToI.legalFor({{S16, S16}});
471   else
472     FPToI.minScalar(1, S32);
473 
474   FPToI.minScalar(0, S32)
475        .scalarize(0);
476 
477   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
478     .scalarize(0)
479     .lower();
480 
481   if (ST.has16BitInsts()) {
482     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
483       .legalFor({S16, S32, S64})
484       .clampScalar(0, S16, S64)
485       .scalarize(0);
486   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
487     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
488       .legalFor({S32, S64})
489       .clampScalar(0, S32, S64)
490       .scalarize(0);
491   } else {
492     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
493       .legalFor({S32})
494       .customFor({S64})
495       .clampScalar(0, S32, S64)
496       .scalarize(0);
497   }
498 
499   getActionDefinitionsBuilder(G_PTR_ADD)
500     .legalForCartesianProduct(AddrSpaces64, {S64})
501     .legalForCartesianProduct(AddrSpaces32, {S32})
502     .scalarize(0);
503 
504   getActionDefinitionsBuilder(G_PTR_MASK)
505     .scalarize(0)
506     .alwaysLegal();
507 
508   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
509 
510   auto &CmpBuilder =
511     getActionDefinitionsBuilder(G_ICMP)
512     // The compare output type differs based on the register bank of the output,
513     // so make both s1 and s32 legal.
514     //
515     // Scalar compares producing output in scc will be promoted to s32, as that
516     // is the allocatable register type that will be needed for the copy from
517     // scc. This will be promoted during RegBankSelect, and we assume something
518     // before that won't try to use s32 result types.
519     //
520     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
521     // bank.
522     .legalForCartesianProduct(
523       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
524     .legalForCartesianProduct(
525       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
526   if (ST.has16BitInsts()) {
527     CmpBuilder.legalFor({{S1, S16}});
528   }
529 
530   CmpBuilder
531     .widenScalarToNextPow2(1)
532     .clampScalar(1, S32, S64)
533     .scalarize(0)
534     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
535 
536   getActionDefinitionsBuilder(G_FCMP)
537     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
538     .widenScalarToNextPow2(1)
539     .clampScalar(1, S32, S64)
540     .scalarize(0);
541 
542   // FIXME: fexp, flog2, flog10 need to be custom lowered.
543   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
544                                G_FLOG, G_FLOG2, G_FLOG10})
545     .legalFor({S32})
546     .scalarize(0);
547 
548   // The 64-bit versions produce 32-bit results, but only on the SALU.
549   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
550                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
551                                G_CTPOP})
552     .legalFor({{S32, S32}, {S32, S64}})
553     .clampScalar(0, S32, S32)
554     .clampScalar(1, S32, S64)
555     .scalarize(0)
556     .widenScalarToNextPow2(0, 32)
557     .widenScalarToNextPow2(1, 32);
558 
559   // TODO: Expand for > s32
560   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
561     .legalFor({S32})
562     .clampScalar(0, S32, S32)
563     .scalarize(0);
564 
565   if (ST.has16BitInsts()) {
566     if (ST.hasVOP3PInsts()) {
567       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
568         .legalFor({S32, S16, V2S16})
569         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
570         .clampMaxNumElements(0, S16, 2)
571         .clampScalar(0, S16, S32)
572         .widenScalarToNextPow2(0)
573         .scalarize(0);
574     } else {
575       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
576         .legalFor({S32, S16})
577         .widenScalarToNextPow2(0)
578         .clampScalar(0, S16, S32)
579         .scalarize(0);
580     }
581   } else {
582     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
583       .legalFor({S32})
584       .clampScalar(0, S32, S32)
585       .widenScalarToNextPow2(0)
586       .scalarize(0);
587   }
588 
589   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
590     return [=](const LegalityQuery &Query) {
591       return Query.Types[TypeIdx0].getSizeInBits() <
592              Query.Types[TypeIdx1].getSizeInBits();
593     };
594   };
595 
596   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
597     return [=](const LegalityQuery &Query) {
598       return Query.Types[TypeIdx0].getSizeInBits() >
599              Query.Types[TypeIdx1].getSizeInBits();
600     };
601   };
602 
603   getActionDefinitionsBuilder(G_INTTOPTR)
604     // List the common cases
605     .legalForCartesianProduct(AddrSpaces64, {S64})
606     .legalForCartesianProduct(AddrSpaces32, {S32})
607     .scalarize(0)
608     // Accept any address space as long as the size matches
609     .legalIf(sameSize(0, 1))
610     .widenScalarIf(smallerThan(1, 0),
611       [](const LegalityQuery &Query) {
612         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
613       })
614     .narrowScalarIf(greaterThan(1, 0),
615       [](const LegalityQuery &Query) {
616         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
617       });
618 
619   getActionDefinitionsBuilder(G_PTRTOINT)
620     // List the common cases
621     .legalForCartesianProduct(AddrSpaces64, {S64})
622     .legalForCartesianProduct(AddrSpaces32, {S32})
623     .scalarize(0)
624     // Accept any address space as long as the size matches
625     .legalIf(sameSize(0, 1))
626     .widenScalarIf(smallerThan(0, 1),
627       [](const LegalityQuery &Query) {
628         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
629       })
630     .narrowScalarIf(
631       greaterThan(0, 1),
632       [](const LegalityQuery &Query) {
633         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
634       });
635 
636   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
637     .scalarize(0)
638     .custom();
639 
640   // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
641   // handle some operations by just promoting the register during
642   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
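
  // Largest single memory access, in bits, allowed for a given address space;
  // wider accesses get broken up during legalization.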
643   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
644     switch (AS) {
645     // FIXME: Private element size.
646     case AMDGPUAS::PRIVATE_ADDRESS:
647       return 32;
648     // FIXME: Check subtarget
649     case AMDGPUAS::LOCAL_ADDRESS:
650       return ST.useDS128() ? 128 : 64;
651 
652     // Treat constant and global as identical. SMRD loads are sometimes usable
653     // for global loads (ideally constant address space should be eliminated)
654     // depending on the context. Legality cannot be context dependent, but
655     // RegBankSelect can split the load as necessary depending on the pointer
656     // register bank/uniformity and if the memory is invariant or not written in
657     // a kernel.
658     case AMDGPUAS::CONSTANT_ADDRESS:
659     case AMDGPUAS::GLOBAL_ADDRESS:
660       return 512;
661     default:
662       return 128;
663     }
664   };
665 
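  // Return true if this memory access has to be split: vector extloads,
  // accesses wider than maxSizeForAddrSpace, dwordx3 accesses on subtargets
  // without them, and under-aligned accesses the target cannot handle.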
666   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
667     const LLT DstTy = Query.Types[0];
668 
669     // Split vector extloads.
670     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
671     unsigned Align = Query.MMODescrs[0].AlignInBits;
672 
673     if (MemSize < DstTy.getSizeInBits())
674       MemSize = std::max(MemSize, Align);
675 
676     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
677       return true;
678 
679     const LLT PtrTy = Query.Types[1];
680     unsigned AS = PtrTy.getAddressSpace();
681     if (MemSize > maxSizeForAddrSpace(AS))
682       return true;
683 
684     // Catch weird-sized loads that don't evenly divide into the access sizes.
685     // TODO: May be able to widen depending on alignment etc.
686     unsigned NumRegs = MemSize / 32;
687     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
688       return true;
689 
690     if (Align < MemSize) {
691       const SITargetLowering *TLI = ST.getTargetLowering();
692       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
693     }
694 
695     return false;
696   };
697 
698   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
699   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
700   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
701 
702   // TODO: Refine based on subtargets which support unaligned access or 128-bit
703   // LDS
704   // TODO: Unsupported flat for SI.
705 
706   for (unsigned Op : {G_LOAD, G_STORE}) {
707     const bool IsStore = Op == G_STORE;
708 
709     auto &Actions = getActionDefinitionsBuilder(Op);
710     // Whitelist the common cases.
711     // TODO: Pointer loads
712     // TODO: Wide constant loads
713     // TODO: Only CI+ has 3x loads
714     // TODO: Loads to s16 on gfx9
715     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
716                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
717                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
718                                       {S96, GlobalPtr, 96, GlobalAlign32},
719                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
720                                       {S128, GlobalPtr, 128, GlobalAlign32},
721                                       {S64, GlobalPtr, 64, GlobalAlign32},
722                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
723                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
724                                       {S32, GlobalPtr, 8, GlobalAlign8},
725                                       {S32, GlobalPtr, 16, GlobalAlign16},
726 
727                                       {S32, LocalPtr, 32, 32},
728                                       {S64, LocalPtr, 64, 32},
729                                       {V2S32, LocalPtr, 64, 32},
730                                       {S32, LocalPtr, 8, 8},
731                                       {S32, LocalPtr, 16, 16},
732                                       {V2S16, LocalPtr, 32, 32},
733 
734                                       {S32, PrivatePtr, 32, 32},
735                                       {S32, PrivatePtr, 8, 8},
736                                       {S32, PrivatePtr, 16, 16},
737                                       {V2S16, PrivatePtr, 32, 32},
738 
739                                       {S32, FlatPtr, 32, GlobalAlign32},
740                                       {S32, FlatPtr, 16, GlobalAlign16},
741                                       {S32, FlatPtr, 8, GlobalAlign8},
742                                       {V2S16, FlatPtr, 32, GlobalAlign32},
743 
744                                       {S32, ConstantPtr, 32, GlobalAlign32},
745                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
746                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
747                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
748                                       {S64, ConstantPtr, 64, GlobalAlign32},
749                                       {S128, ConstantPtr, 128, GlobalAlign32},
750                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
751     Actions
752         .customIf(typeIs(1, Constant32Ptr))
753         .narrowScalarIf(
754             [=](const LegalityQuery &Query) -> bool {
755               return !Query.Types[0].isVector() && needToSplitLoad(Query);
756             },
757             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
758               const LLT DstTy = Query.Types[0];
759               const LLT PtrTy = Query.Types[1];
760 
761               const unsigned DstSize = DstTy.getSizeInBits();
762               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
763 
764               // Split extloads.
765               if (DstSize > MemSize)
766                 return std::make_pair(0, LLT::scalar(MemSize));
767 
768               if (DstSize > 32 && (DstSize % 32 != 0)) {
769                 // FIXME: Need a way to specify non-extload of larger size if
770                 // suitably aligned.
771                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
772               }
773 
774               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
775               if (MemSize > MaxSize)
776                 return std::make_pair(0, LLT::scalar(MaxSize));
777 
778               unsigned Align = Query.MMODescrs[0].AlignInBits;
779               return std::make_pair(0, LLT::scalar(Align));
780             })
781         .fewerElementsIf(
782             [=](const LegalityQuery &Query) -> bool {
783               return Query.Types[0].isVector() && needToSplitLoad(Query);
784             },
785             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
786               const LLT DstTy = Query.Types[0];
787               const LLT PtrTy = Query.Types[1];
788 
789               LLT EltTy = DstTy.getElementType();
790               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
791 
792               // Split if it's too large for the address space.
793               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
794                 unsigned NumElts = DstTy.getNumElements();
795                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
796 
797                 // FIXME: Refine when odd breakdowns handled
798                 // The scalars will need to be re-legalized.
799                 if (NumPieces == 1 || NumPieces >= NumElts ||
800                     NumElts % NumPieces != 0)
801                   return std::make_pair(0, EltTy);
802 
803                 return std::make_pair(0,
804                                       LLT::vector(NumElts / NumPieces, EltTy));
805               }
806 
807               // Need to split because of alignment.
808               unsigned Align = Query.MMODescrs[0].AlignInBits;
809               unsigned EltSize = EltTy.getSizeInBits();
810               if (EltSize > Align &&
811                   (EltSize / Align < DstTy.getNumElements())) {
812                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
813               }
814 
815               // May need relegalization for the scalars.
816               return std::make_pair(0, EltTy);
817             })
818         .minScalar(0, S32);
819 
820     if (IsStore)
821       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
822 
823     // TODO: Need a bitcast lower option?
824     Actions
825         .legalIf([=](const LegalityQuery &Query) {
826           const LLT Ty0 = Query.Types[0];
827           unsigned Size = Ty0.getSizeInBits();
828           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
829           unsigned Align = Query.MMODescrs[0].AlignInBits;
830 
831           // FIXME: Widening store from alignment not valid.
832           if (MemSize < Size)
833             MemSize = std::max(MemSize, Align);
834 
835           // No extending vector loads.
836           if (Size > MemSize && Ty0.isVector())
837             return false;
838 
839           switch (MemSize) {
840           case 8:
841           case 16:
842             return Size == 32;
843           case 32:
844           case 64:
845           case 128:
846             return true;
847           case 96:
848             return ST.hasDwordx3LoadStores();
849           case 256:
850           case 512:
851             return true;
852           default:
853             return false;
854           }
855         })
856         .widenScalarToNextPow2(0)
857         // TODO: v3s32->v4s32 with alignment
858         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
859   }
860 
861   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
862                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
863                                                   {S32, GlobalPtr, 16, 2 * 8},
864                                                   {S32, LocalPtr, 8, 8},
865                                                   {S32, LocalPtr, 16, 16},
866                                                   {S32, PrivatePtr, 8, 8},
867                                                   {S32, PrivatePtr, 16, 16},
868                                                   {S32, ConstantPtr, 8, 8},
869                                                   {S32, ConstantPtr, 16, 2 * 8}});
870   if (ST.hasFlatAddressSpace()) {
871     ExtLoads.legalForTypesWithMemDesc(
872         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
873   }
874 
875   ExtLoads.clampScalar(0, S32, S32)
876           .widenScalarToNextPow2(0)
877           .unsupportedIfMemSizeNotPow2()
878           .lower();
879 
880   auto &Atomics = getActionDefinitionsBuilder(
881     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
882      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
883      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
884      G_ATOMICRMW_UMIN})
885     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
886                {S64, GlobalPtr}, {S64, LocalPtr}});
887   if (ST.hasFlatAddressSpace()) {
888     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
889   }
890 
891   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
892     .legalFor({{S32, LocalPtr}});
893 
894   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
895   // demarshalling.
896   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
897     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
898                 {S32, FlatPtr}, {S64, FlatPtr}})
899     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
900                {S32, RegionPtr}, {S64, RegionPtr}});
901 
902   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
903     .lower();
904 
905   // TODO: Pointer types, any 32-bit or 64-bit vector
906 
907   // Condition should be s32 for scalar, s1 for vector.
908   getActionDefinitionsBuilder(G_SELECT)
909     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
910           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
911           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
912     .clampScalar(0, S16, S64)
913     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
914     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
915     .scalarize(1)
916     .clampMaxNumElements(0, S32, 2)
917     .clampMaxNumElements(0, LocalPtr, 2)
918     .clampMaxNumElements(0, PrivatePtr, 2)
919     .scalarize(0)
920     .widenScalarToNextPow2(0)
921     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
922 
923   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
924   // be more flexible with the shift amount type.
925   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
926     .legalFor({{S32, S32}, {S64, S32}});
927   if (ST.has16BitInsts()) {
928     if (ST.hasVOP3PInsts()) {
929       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
930             .clampMaxNumElements(0, S16, 2);
931     } else
932       Shifts.legalFor({{S16, S32}, {S16, S16}});
933 
934     // TODO: Support 16-bit shift amounts
935     Shifts.clampScalar(1, S32, S32);
936     Shifts.clampScalar(0, S16, S64);
937     Shifts.widenScalarToNextPow2(0, 16);
938   } else {
939     // Make sure we legalize the shift amount type first, as the general
940     // expansion for the shifted type will produce much worse code if it hasn't
941     // been truncated already.
942     Shifts.clampScalar(1, S32, S32);
943     Shifts.clampScalar(0, S32, S64);
944     Shifts.widenScalarToNextPow2(0, 32);
945   }
946   Shifts.scalarize(0);
947 
948   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
949     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
950     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
951     unsigned IdxTypeIdx = 2;
952 
953     getActionDefinitionsBuilder(Op)
954       .customIf([=](const LegalityQuery &Query) {
955           const LLT EltTy = Query.Types[EltTypeIdx];
956           const LLT VecTy = Query.Types[VecTypeIdx];
957           const LLT IdxTy = Query.Types[IdxTypeIdx];
958           return (EltTy.getSizeInBits() == 16 ||
959                   EltTy.getSizeInBits() % 32 == 0) &&
960                  VecTy.getSizeInBits() % 32 == 0 &&
961                  VecTy.getSizeInBits() <= 1024 &&
962                  IdxTy.getSizeInBits() == 32;
963         })
964       .clampScalar(EltTypeIdx, S32, S64)
965       .clampScalar(VecTypeIdx, S32, S64)
966       .clampScalar(IdxTypeIdx, S32, S32);
967   }
968 
969   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
970     .unsupportedIf([=](const LegalityQuery &Query) {
971         const LLT &EltTy = Query.Types[1].getElementType();
972         return Query.Types[0] != EltTy;
973       });
974 
975   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
976     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
977     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
978 
979     // FIXME: Doesn't handle extract of illegal sizes.
980     getActionDefinitionsBuilder(Op)
981       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
982       // FIXME: Multiples of 16 should not be legal.
983       .legalIf([=](const LegalityQuery &Query) {
984           const LLT BigTy = Query.Types[BigTyIdx];
985           const LLT LitTy = Query.Types[LitTyIdx];
986           return (BigTy.getSizeInBits() % 32 == 0) &&
987                  (LitTy.getSizeInBits() % 16 == 0);
988         })
989       .widenScalarIf(
990         [=](const LegalityQuery &Query) {
991           const LLT BigTy = Query.Types[BigTyIdx];
992           return (BigTy.getScalarSizeInBits() < 16);
993         },
994         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
995       .widenScalarIf(
996         [=](const LegalityQuery &Query) {
997           const LLT LitTy = Query.Types[LitTyIdx];
998           return (LitTy.getScalarSizeInBits() < 16);
999         },
1000         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1001       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1002       .widenScalarToNextPow2(BigTyIdx, 32);
1003 
1004   }
1005 
1006   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1007     .legalForCartesianProduct(AllS32Vectors, {S32})
1008     .legalForCartesianProduct(AllS64Vectors, {S64})
1009     .clampNumElements(0, V16S32, V32S32)
1010     .clampNumElements(0, V2S64, V16S64)
1011     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1012 
1013   if (ST.hasScalarPackInsts())
1014     BuildVector.legalFor({V2S16, S32});
1015 
1016   BuildVector
1017     .minScalarSameAs(1, 0)
1018     .legalIf(isRegisterType(0))
1019     .minScalarOrElt(0, S32);
1020 
1021   if (ST.hasScalarPackInsts()) {
1022     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1023       .legalFor({V2S16, S32})
1024       .lower();
1025   } else {
1026     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1027       .lower();
1028   }
1029 
1030   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1031     .legalIf(isRegisterType(0));
1032 
1033   // TODO: Don't fully scalarize v2s16 pieces
1034   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1035 
1036   // Merge/Unmerge
1037   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1038     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1039     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1040 
1041     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1042       const LLT &Ty = Query.Types[TypeIdx];
1043       if (Ty.isVector()) {
1044         const LLT &EltTy = Ty.getElementType();
1045         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1046           return true;
1047         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1048           return true;
1049       }
1050       return false;
1051     };
1052 
1053     auto &Builder = getActionDefinitionsBuilder(Op)
1054       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1055       // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
1056       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1057       // valid.
1058       .clampScalar(LitTyIdx, S16, S256)
1059       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1060       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1061       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1062                            elementTypeIs(1, S16)),
1063                        changeTo(1, V2S16))
1064       // Break up vectors with weird elements into scalars
1065       .fewerElementsIf(
1066         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1067         scalarize(0))
1068       .fewerElementsIf(
1069         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1070         scalarize(1))
1071       .clampScalar(BigTyIdx, S32, S1024)
1072       .lowerFor({{S16, V2S16}});
1073 
1074     if (Op == G_MERGE_VALUES) {
1075       Builder.widenScalarIf(
1076         // TODO: Use 16-bit shifts if legal for 8-bit values?
1077         [=](const LegalityQuery &Query) {
1078           const LLT Ty = Query.Types[LitTyIdx];
1079           return Ty.getSizeInBits() < 32;
1080         },
1081         changeTo(LitTyIdx, S32));
1082     }
1083 
1084     Builder.widenScalarIf(
1085       [=](const LegalityQuery &Query) {
1086         const LLT Ty = Query.Types[BigTyIdx];
1087         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1088           Ty.getSizeInBits() % 16 != 0;
1089       },
1090       [=](const LegalityQuery &Query) {
1091         // Pick the next power of 2, or a multiple of 64 over 128,
1092         // whichever is smaller.
1093         const LLT &Ty = Query.Types[BigTyIdx];
1094         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1095         if (NewSizeInBits >= 256) {
1096           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1097           if (RoundedTo < NewSizeInBits)
1098             NewSizeInBits = RoundedTo;
1099         }
1100         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1101       })
1102       .legalIf([=](const LegalityQuery &Query) {
1103           const LLT &BigTy = Query.Types[BigTyIdx];
1104           const LLT &LitTy = Query.Types[LitTyIdx];
1105 
1106           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1107             return false;
1108           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1109             return false;
1110 
1111           return BigTy.getSizeInBits() % 16 == 0 &&
1112                  LitTy.getSizeInBits() % 16 == 0 &&
1113                  BigTy.getSizeInBits() <= 1024;
1114         })
1115       // Any vectors left are the wrong size. Scalarize them.
1116       .scalarize(0)
1117       .scalarize(1);
1118   }
1119 
1120   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1121 
1122   getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1123 
1124   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1125     .legalFor({S64});
1126 
1127   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1128         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1129         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1130     .unsupported();
1131 
1132   computeTables();
1133   verify(*ST.getInstrInfo());
1134 }
1135 
1136 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1137                                          MachineRegisterInfo &MRI,
1138                                          MachineIRBuilder &B,
1139                                          GISelChangeObserver &Observer) const {
1140   switch (MI.getOpcode()) {
1141   case TargetOpcode::G_ADDRSPACE_CAST:
1142     return legalizeAddrSpaceCast(MI, MRI, B);
1143   case TargetOpcode::G_FRINT:
1144     return legalizeFrint(MI, MRI, B);
1145   case TargetOpcode::G_FCEIL:
1146     return legalizeFceil(MI, MRI, B);
1147   case TargetOpcode::G_INTRINSIC_TRUNC:
1148     return legalizeIntrinsicTrunc(MI, MRI, B);
1149   case TargetOpcode::G_SITOFP:
1150     return legalizeITOFP(MI, MRI, B, true);
1151   case TargetOpcode::G_UITOFP:
1152     return legalizeITOFP(MI, MRI, B, false);
1153   case TargetOpcode::G_FMINNUM:
1154   case TargetOpcode::G_FMAXNUM:
1155   case TargetOpcode::G_FMINNUM_IEEE:
1156   case TargetOpcode::G_FMAXNUM_IEEE:
1157     return legalizeMinNumMaxNum(MI, MRI, B);
1158   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1159     return legalizeExtractVectorElt(MI, MRI, B);
1160   case TargetOpcode::G_INSERT_VECTOR_ELT:
1161     return legalizeInsertVectorElt(MI, MRI, B);
1162   case TargetOpcode::G_FSIN:
1163   case TargetOpcode::G_FCOS:
1164     return legalizeSinCos(MI, MRI, B);
1165   case TargetOpcode::G_GLOBAL_VALUE:
1166     return legalizeGlobalValue(MI, MRI, B);
1167   case TargetOpcode::G_LOAD:
1168     return legalizeLoad(MI, MRI, B, Observer);
1169   case TargetOpcode::G_FMAD:
1170     return legalizeFMad(MI, MRI, B);
1171   case TargetOpcode::G_FDIV:
1172     return legalizeFDIV(MI, MRI, B);
1173   case TargetOpcode::G_ATOMIC_CMPXCHG:
1174     return legalizeAtomicCmpXChg(MI, MRI, B);
1175   default:
1176     return false;
1177   }
1178 
1179   llvm_unreachable("expected switch to return");
1180 }
1181 
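// Return a 32-bit register holding the aperture, i.e. the high half of the
// 64-bit flat address range, for the LOCAL or PRIVATE address space. It is
// read from the aperture hardware registers when available, and otherwise
// loaded from the queue pointer.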
1182 Register AMDGPULegalizerInfo::getSegmentAperture(
1183   unsigned AS,
1184   MachineRegisterInfo &MRI,
1185   MachineIRBuilder &B) const {
1186   MachineFunction &MF = B.getMF();
1187   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1188   const LLT S32 = LLT::scalar(32);
1189 
1190   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1191 
1192   if (ST.hasApertureRegs()) {
1193     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1194     // getreg.
1195     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1196         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1197         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1198     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1199         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1200         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1201     unsigned Encoding =
1202         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1203         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1204         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1205 
1206     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1207     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1208 
1209     B.buildInstr(AMDGPU::S_GETREG_B32)
1210       .addDef(GetReg)
1211       .addImm(Encoding);
1212     MRI.setType(GetReg, S32);
1213 
1214     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1215     B.buildInstr(TargetOpcode::G_SHL)
1216       .addDef(ApertureReg)
1217       .addUse(GetReg)
1218       .addUse(ShiftAmt.getReg(0));
1219 
1220     return ApertureReg;
1221   }
1222 
1223   Register QueuePtr = MRI.createGenericVirtualRegister(
1224     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1225 
1226   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1227   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1228     return Register();
1229 
1230   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1231   // private_segment_aperture_base_hi.
1232   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1233 
1234   // TODO: can we be smarter about machine pointer info?
1235   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1236   MachineMemOperand *MMO = MF.getMachineMemOperand(
1237     PtrInfo,
1238     MachineMemOperand::MOLoad |
1239     MachineMemOperand::MODereferenceable |
1240     MachineMemOperand::MOInvariant,
1241     4,
1242     MinAlign(64, StructOffset));
1243 
1244   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1245   Register LoadAddr;
1246 
1247   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1248   B.buildLoad(LoadResult, LoadAddr, *MMO);
1249   return LoadResult;
1250 }
1251 
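// Lower G_ADDRSPACE_CAST: no-op casts become bitcasts, casts involving the
// 32-bit constant address space truncate or merge in the known high bits, and
// casts between flat and local/private compare against the null value and
// splice in the segment aperture.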
1252 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1253   MachineInstr &MI, MachineRegisterInfo &MRI,
1254   MachineIRBuilder &B) const {
1255   MachineFunction &MF = B.getMF();
1256 
1257   B.setInstr(MI);
1258 
1259   const LLT S32 = LLT::scalar(32);
1260   Register Dst = MI.getOperand(0).getReg();
1261   Register Src = MI.getOperand(1).getReg();
1262 
1263   LLT DstTy = MRI.getType(Dst);
1264   LLT SrcTy = MRI.getType(Src);
1265   unsigned DestAS = DstTy.getAddressSpace();
1266   unsigned SrcAS = SrcTy.getAddressSpace();
1267 
1268   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1269   // vector element.
1270   assert(!DstTy.isVector());
1271 
1272   const AMDGPUTargetMachine &TM
1273     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1274 
1275   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1276   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1277     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1278     return true;
1279   }
1280 
1281   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1282     // Truncate.
1283     B.buildExtract(Dst, Src, 0);
1284     MI.eraseFromParent();
1285     return true;
1286   }
1287 
1288   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1289     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1290     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1291 
1292     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1293     // another pointer. Merge operands are required to be the same type, but
1294     // creating an extra ptrtoint would be kind of pointless.
1295     auto HighAddr = B.buildConstant(
1296       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1297     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1298     MI.eraseFromParent();
1299     return true;
1300   }
1301 
1302   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1303     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1304            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1305     unsigned NullVal = TM.getNullPointerValue(DestAS);
1306 
1307     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1308     auto FlatNull = B.buildConstant(SrcTy, 0);
1309 
1310     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1311 
1312     // Extract low 32-bits of the pointer.
1313     B.buildExtract(PtrLo32, Src, 0);
1314 
1315     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1316     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1317     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1318 
1319     MI.eraseFromParent();
1320     return true;
1321   }
1322 
1323   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1324     return false;
1325 
1326   if (!ST.hasFlatAddressSpace())
1327     return false;
1328 
1329   auto SegmentNull =
1330       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1331   auto FlatNull =
1332       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1333 
1334   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1335   if (!ApertureReg.isValid())
1336     return false;
1337 
1338   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1339   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1340 
1341   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1342 
1343   // Coerce the type of the low half of the result so we can use merge_values.
1344   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1345   B.buildInstr(TargetOpcode::G_PTRTOINT)
1346     .addDef(SrcAsInt)
1347     .addUse(Src);
1348 
1349   // TODO: Should we allow mismatched types but matching sizes in merges to
1350   // avoid the ptrtoint?
1351   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1352   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1353 
1354   MI.eraseFromParent();
1355   return true;
1356 }
1357 
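// Lower f64 G_FRINT by adding and subtracting 2^52 with the sign of the
// source; values with a magnitude too large to hold a fraction are already
// integers and are returned unchanged.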
1358 bool AMDGPULegalizerInfo::legalizeFrint(
1359   MachineInstr &MI, MachineRegisterInfo &MRI,
1360   MachineIRBuilder &B) const {
1361   B.setInstr(MI);
1362 
1363   Register Src = MI.getOperand(1).getReg();
1364   LLT Ty = MRI.getType(Src);
1365   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1366 
1367   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1368   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1369 
1370   auto C1 = B.buildFConstant(Ty, C1Val);
1371   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1372 
1373   // TODO: Should this propagate fast-math-flags?
1374   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1375   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1376 
1377   auto C2 = B.buildFConstant(Ty, C2Val);
1378   auto Fabs = B.buildFAbs(Ty, Src);
1379 
1380   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1381   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1382   return true;
1383 }
1384 
1385 bool AMDGPULegalizerInfo::legalizeFceil(
1386   MachineInstr &MI, MachineRegisterInfo &MRI,
1387   MachineIRBuilder &B) const {
1388   B.setInstr(MI);
1389 
1390   const LLT S1 = LLT::scalar(1);
1391   const LLT S64 = LLT::scalar(64);
1392 
1393   Register Src = MI.getOperand(1).getReg();
1394   assert(MRI.getType(Src) == S64);
1395 
1396   // result = trunc(src)
1397   // if (src > 0.0 && src != result)
1398   //   result += 1.0
1399 
1400   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1401 
1402   const auto Zero = B.buildFConstant(S64, 0.0);
1403   const auto One = B.buildFConstant(S64, 1.0);
1404   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1405   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1406   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1407   auto Add = B.buildSelect(S64, And, One, Zero);
1408 
1409   // TODO: Should this propagate fast-math-flags?
1410   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1411   return true;
1412 }
1413 
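// Extract the 11-bit biased exponent from the high 32 bits of an f64 value and
// subtract the bias (1023).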
1414 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1415                                               MachineIRBuilder &B) {
1416   const unsigned FractBits = 52;
1417   const unsigned ExpBits = 11;
1418   LLT S32 = LLT::scalar(32);
1419 
1420   auto Const0 = B.buildConstant(S32, FractBits - 32);
1421   auto Const1 = B.buildConstant(S32, ExpBits);
1422 
1423   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
1424     .addUse(Const0.getReg(0))
1425     .addUse(Const1.getReg(0));
1426 
1427   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1428 }
1429 
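// Lower f64 G_INTRINSIC_TRUNC: compute the unbiased exponent, mask off the
// fraction bits below it, produce a signed zero when the exponent is negative,
// and return the source unchanged when the exponent exceeds 51 (the value is
// already an integer).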
1430 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1431   MachineInstr &MI, MachineRegisterInfo &MRI,
1432   MachineIRBuilder &B) const {
1433   B.setInstr(MI);
1434 
1435   const LLT S1 = LLT::scalar(1);
1436   const LLT S32 = LLT::scalar(32);
1437   const LLT S64 = LLT::scalar(64);
1438 
1439   Register Src = MI.getOperand(1).getReg();
1440   assert(MRI.getType(Src) == S64);
1441 
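  // Truncate by clearing the fraction bits that lie below the binary point:
  // AND Src with ~(FractMask >> Exp). If Exp < 0, |Src| < 1 and the result is
  // just the sign (+/-0.0). If Exp > 51 there are no fraction bits left to
  // clear, so Src is returned unchanged (this also covers inf and nan).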
1442   // TODO: Should this use extract since the low half is unused?
1443   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1444   Register Hi = Unmerge.getReg(1);
1445 
1446   // Extract the upper half, since this is where we will find the sign and
1447   // exponent.
1448   auto Exp = extractF64Exponent(Hi, B);
1449 
1450   const unsigned FractBits = 52;
1451 
1452   // Extract the sign bit.
1453   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1454   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1455 
1456   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1457 
1458   const auto Zero32 = B.buildConstant(S32, 0);
1459 
1460   // Extend back to 64-bits.
1461   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1462 
1463   auto Shr = B.buildAShr(S64, FractMask, Exp);
1464   auto Not = B.buildNot(S64, Shr);
1465   auto Tmp0 = B.buildAnd(S64, Src, Not);
1466   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1467 
1468   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1469   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1470 
1471   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1474 }
1475 
1476 bool AMDGPULegalizerInfo::legalizeITOFP(
1477   MachineInstr &MI, MachineRegisterInfo &MRI,
1478   MachineIRBuilder &B, bool Signed) const {
1479   B.setInstr(MI);
1480 
1481   Register Dst = MI.getOperand(0).getReg();
1482   Register Src = MI.getOperand(1).getReg();
1483 
1484   const LLT S64 = LLT::scalar(64);
1485   const LLT S32 = LLT::scalar(32);
1486 
1487   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1488 
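  // Convert the two 32-bit halves separately: the high half (signed or
  // unsigned as requested) is scaled by 2^32 with ldexp, and the unsigned low
  // half is then added in with a single rounding.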
1489   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1490 
1491   auto CvtHi = Signed ?
1492     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1493     B.buildUITOFP(S64, Unmerge.getReg(1));
1494 
1495   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1496 
1497   auto ThirtyTwo = B.buildConstant(S32, 32);
1498   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1499     .addUse(CvtHi.getReg(0))
1500     .addUse(ThirtyTwo.getReg(0));
1501 
1502   // TODO: Should this propagate fast-math-flags?
1503   B.buildFAdd(Dst, LdExp, CvtLo);
1504   MI.eraseFromParent();
1505   return true;
1506 }
1507 
1508 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1509   MachineInstr &MI, MachineRegisterInfo &MRI,
1510   MachineIRBuilder &B) const {
1511   MachineFunction &MF = B.getMF();
1512   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1513 
1514   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1515                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1516 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1519   if (!MFI->getMode().IEEE)
1520     return !IsIEEEOp;
1521 
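  // With ieee_mode enabled the hardware min/max matches the *_IEEE opcodes,
  // so those are already legal; only plain G_FMINNUM/G_FMAXNUM need the
  // generic quieting expansion below.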
1522   if (IsIEEEOp)
1523     return true;
1524 
1525   MachineIRBuilder HelperBuilder(MI);
1526   GISelObserverWrapper DummyObserver;
1527   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1528   HelperBuilder.setInstr(MI);
1529   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1530 }
1531 
1532 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1533   MachineInstr &MI, MachineRegisterInfo &MRI,
1534   MachineIRBuilder &B) const {
1535   // TODO: Should move some of this into LegalizerHelper.
1536 
1537   // TODO: Promote dynamic indexing of s16 to s32
1538   // TODO: Dynamic s64 indexing is only legal for SGPR.
1539   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1540   if (!IdxVal) // Dynamic case will be selected to register indexing.
1541     return true;
1542 
1543   Register Dst = MI.getOperand(0).getReg();
1544   Register Vec = MI.getOperand(1).getReg();
1545 
1546   LLT VecTy = MRI.getType(Vec);
1547   LLT EltTy = VecTy.getElementType();
1548   assert(EltTy == MRI.getType(Dst));
1549 
1550   B.setInstr(MI);
1551 
1552   if (IdxVal.getValue() < VecTy.getNumElements())
1553     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1554   else
1555     B.buildUndef(Dst);
1556 
1557   MI.eraseFromParent();
1558   return true;
1559 }
1560 
1561 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1562   MachineInstr &MI, MachineRegisterInfo &MRI,
1563   MachineIRBuilder &B) const {
1564   // TODO: Should move some of this into LegalizerHelper.
1565 
1566   // TODO: Promote dynamic indexing of s16 to s32
1567   // TODO: Dynamic s64 indexing is only legal for SGPR.
1568   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1569   if (!IdxVal) // Dynamic case will be selected to register indexing.
1570     return true;
1571 
1572   Register Dst = MI.getOperand(0).getReg();
1573   Register Vec = MI.getOperand(1).getReg();
1574   Register Ins = MI.getOperand(2).getReg();
1575 
1576   LLT VecTy = MRI.getType(Vec);
1577   LLT EltTy = VecTy.getElementType();
1578   assert(EltTy == MRI.getType(Ins));
1579 
1580   B.setInstr(MI);
1581 
1582   if (IdxVal.getValue() < VecTy.getNumElements())
1583     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1584   else
1585     B.buildUndef(Dst);
1586 
1587   MI.eraseFromParent();
1588   return true;
1589 }
1590 
1591 bool AMDGPULegalizerInfo::legalizeSinCos(
1592   MachineInstr &MI, MachineRegisterInfo &MRI,
1593   MachineIRBuilder &B) const {
1594   B.setInstr(MI);
1595 
1596   Register DstReg = MI.getOperand(0).getReg();
1597   Register SrcReg = MI.getOperand(1).getReg();
1598   LLT Ty = MRI.getType(DstReg);
1599   unsigned Flags = MI.getFlags();
1600 
1601   Register TrigVal;
1602   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
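  // The hardware sin/cos instructions expect the input pre-multiplied by
  // 1/(2*pi). Subtargets with a reduced valid input range additionally need
  // the scaled value wrapped into [0, 1) with fract.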
1603   if (ST.hasTrigReducedRange()) {
1604     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1605     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1606       .addUse(MulVal.getReg(0))
1607       .setMIFlags(Flags).getReg(0);
1608   } else
1609     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1610 
1611   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1612     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1613   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1614     .addUse(TrigVal)
1615     .setMIFlags(Flags);
1616   MI.eraseFromParent();
1617   return true;
1618 }
1619 
1620 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1621   Register DstReg, LLT PtrTy,
1622   MachineIRBuilder &B, const GlobalValue *GV,
1623   unsigned Offset, unsigned GAFlags) const {
1624   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1625   // to the following code sequence:
1626   //
1627   // For constant address space:
1628   //   s_getpc_b64 s[0:1]
1629   //   s_add_u32 s0, s0, $symbol
1630   //   s_addc_u32 s1, s1, 0
1631   //
1632   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1633   //   a fixup or relocation is emitted to replace $symbol with a literal
1634   //   constant, which is a pc-relative offset from the encoding of the $symbol
1635   //   operand to the global variable.
1636   //
1637   // For global address space:
1638   //   s_getpc_b64 s[0:1]
1639   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1640   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1641   //
1642   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1643   //   fixups or relocations are emitted to replace $symbol@*@lo and
1644   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1645   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1646   //   operand to the global variable.
1647   //
1648   // What we want here is an offset from the value returned by s_getpc
1649   // (which is the address of the s_add_u32 instruction) to the global
1650   // variable, but since the encoding of $symbol starts 4 bytes after the start
1651   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1652   // small. This requires us to add 4 to the global variable offset in order to
1653   // compute the correct address.
1654 
1655   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1656 
1657   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1658     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1659 
1660   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1661     .addDef(PCReg);
1662 
1663   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1664   if (GAFlags == SIInstrInfo::MO_NONE)
1665     MIB.addImm(0);
1666   else
1667     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1668 
1669   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1670 
1671   if (PtrTy.getSizeInBits() == 32)
1672     B.buildExtract(DstReg, PCReg, 0);
1673   return true;
1674  }
1675 
1676 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1677   MachineInstr &MI, MachineRegisterInfo &MRI,
1678   MachineIRBuilder &B) const {
1679   Register DstReg = MI.getOperand(0).getReg();
1680   LLT Ty = MRI.getType(DstReg);
1681   unsigned AS = Ty.getAddressSpace();
1682 
1683   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1684   MachineFunction &MF = B.getMF();
1685   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1686   B.setInstr(MI);
1687 
1688   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1689     if (!MFI->isEntryFunction()) {
1690       const Function &Fn = MF.getFunction();
1691       DiagnosticInfoUnsupported BadLDSDecl(
1692         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1693       Fn.getContext().diagnose(BadLDSDecl);
1694     }
1695 
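    // The "address" of an LDS global is just its offset within the group
    // segment, so it can be folded to a constant here.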
1696     // TODO: We could emit code to handle the initialization somewhere.
1697     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1698       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1699       MI.eraseFromParent();
1700       return true;
1701     }
1702 
1703     const Function &Fn = MF.getFunction();
1704     DiagnosticInfoUnsupported BadInit(
1705       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1706     Fn.getContext().diagnose(BadInit);
1707     return true;
1708   }
1709 
1710   const SITargetLowering *TLI = ST.getTargetLowering();
1711 
1712   if (TLI->shouldEmitFixup(GV)) {
1713     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1714     MI.eraseFromParent();
1715     return true;
1716   }
1717 
1718   if (TLI->shouldEmitPCReloc(GV)) {
1719     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1720     MI.eraseFromParent();
1721     return true;
1722   }
1723 
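  // Otherwise load the address from the global's GOT entry, which is itself
  // addressed PC-relatively.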
1724   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1725   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1726 
1727   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1728     MachinePointerInfo::getGOT(MF),
1729     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1730     MachineMemOperand::MOInvariant,
1731     8 /*Size*/, 8 /*Align*/);
1732 
1733   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1734 
1735   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1737     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1738     B.buildExtract(DstReg, Load, 0);
1739   } else
1740     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1741 
1742   MI.eraseFromParent();
1743   return true;
1744 }
1745 
1746 bool AMDGPULegalizerInfo::legalizeLoad(
1747   MachineInstr &MI, MachineRegisterInfo &MRI,
1748   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1749   B.setInstr(MI);
1750   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1751   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1752   Observer.changingInstr(MI);
1753   MI.getOperand(1).setReg(Cast.getReg(0));
1754   Observer.changedInstr(MI);
1755   return true;
1756 }
1757 
1758 bool AMDGPULegalizerInfo::legalizeFMad(
1759   MachineInstr &MI, MachineRegisterInfo &MRI,
1760   MachineIRBuilder &B) const {
1761   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1762   assert(Ty.isScalar());
1763 
1764   MachineFunction &MF = B.getMF();
1765   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1766 
1767   // TODO: Always legal with future ftz flag.
1768   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1769     return true;
1770   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1771     return true;
1772 
1774   MachineIRBuilder HelperBuilder(MI);
1775   GISelObserverWrapper DummyObserver;
1776   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1777   HelperBuilder.setMBB(*MI.getParent());
1778   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1779 }
1780 
1781 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1782   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1783   Register DstReg = MI.getOperand(0).getReg();
1784   Register PtrReg = MI.getOperand(1).getReg();
1785   Register CmpVal = MI.getOperand(2).getReg();
1786   Register NewVal = MI.getOperand(3).getReg();
1787 
1788   assert(SITargetLowering::isFlatGlobalAddrSpace(
1789            MRI.getType(PtrReg).getAddressSpace()) &&
1790          "this should not have been custom lowered");
1791 
1792   LLT ValTy = MRI.getType(CmpVal);
1793   LLT VecTy = LLT::vector(2, ValTy);
1794 
1795   B.setInstr(MI);
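  // The target's cmpxchg pseudo expects the new value and the compare value
  // packed together into a two-element vector operand.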
1796   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1797 
1798   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1799     .addDef(DstReg)
1800     .addUse(PtrReg)
1801     .addUse(PackedVal)
1802     .setMemRefs(MI.memoperands());
1803 
1804   MI.eraseFromParent();
1805   return true;
1806 }
1807 
// Return the G_BRCOND instruction that uses the condition output, or null if
// the usage is invalid. If the conditional branch is followed by an
// unconditional G_BR, that branch is returned in Br.
1809 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1810                                        MachineRegisterInfo &MRI,
1811                                        MachineInstr *&Br) {
1812   Register CondDef = MI.getOperand(0).getReg();
1813   if (!MRI.hasOneNonDBGUse(CondDef))
1814     return nullptr;
1815 
1816   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1817   if (UseMI.getParent() != MI.getParent() ||
1818       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1819     return nullptr;
1820 
1821   // Make sure the cond br is followed by a G_BR
1822   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1823   if (Next != MI.getParent()->end()) {
1824     if (Next->getOpcode() != AMDGPU::G_BR)
1825       return nullptr;
1826     Br = &*Next;
1827   }
1828 
1829   return &UseMI;
1830 }
1831 
1832 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1833                                                 Register Reg, LLT Ty) const {
1834   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1835   if (LiveIn)
1836     return LiveIn;
1837 
1838   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1839   MRI.addLiveIn(Reg, NewReg);
1840   return NewReg;
1841 }
1842 
1843 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1844                                          const ArgDescriptor *Arg) const {
1845   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1846     return false; // TODO: Handle these
1847 
1848   assert(Arg->getRegister().isPhysical());
1849 
1850   MachineRegisterInfo &MRI = *B.getMRI();
1851 
1852   LLT Ty = MRI.getType(DstReg);
1853   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1854 
1855   if (Arg->isMasked()) {
1856     // TODO: Should we try to emit this once in the entry block?
1857     const LLT S32 = LLT::scalar(32);
1858     const unsigned Mask = Arg->getMask();
1859     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1860 
1861     Register AndMaskSrc = LiveIn;
1862 
1863     if (Shift != 0) {
1864       auto ShiftAmt = B.buildConstant(S32, Shift);
1865       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1866     }
1867 
1868     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1869   } else
1870     B.buildCopy(DstReg, LiveIn);
1871 
  // Insert the argument copy if it doesn't already exist.
1873   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1874   if (!MRI.getVRegDef(LiveIn)) {
1875     // FIXME: Should have scoped insert pt
1876     MachineBasicBlock &OrigInsBB = B.getMBB();
1877     auto OrigInsPt = B.getInsertPt();
1878 
1879     MachineBasicBlock &EntryMBB = B.getMF().front();
1880     EntryMBB.addLiveIn(Arg->getRegister());
1881     B.setInsertPt(EntryMBB, EntryMBB.begin());
1882     B.buildCopy(LiveIn, Arg->getRegister());
1883 
1884     B.setInsertPt(OrigInsBB, OrigInsPt);
1885   }
1886 
1887   return true;
1888 }
1889 
1890 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1891   MachineInstr &MI,
1892   MachineRegisterInfo &MRI,
1893   MachineIRBuilder &B,
1894   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1895   B.setInstr(MI);
1896 
1897   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1898 
1899   const ArgDescriptor *Arg;
1900   const TargetRegisterClass *RC;
1901   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1902   if (!Arg) {
1903     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1904     return false;
1905   }
1906 
1907   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1908     MI.eraseFromParent();
1909     return true;
1910   }
1911 
1912   return false;
1913 }
1914 
1915 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1916                                        MachineRegisterInfo &MRI,
1917                                        MachineIRBuilder &B) const {
1918   B.setInstr(MI);
1919   Register Dst = MI.getOperand(0).getReg();
1920   LLT DstTy = MRI.getType(Dst);
1921   LLT S16 = LLT::scalar(16);
1922   LLT S32 = LLT::scalar(32);
1923   LLT S64 = LLT::scalar(64);
1924 
1925   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1926     return true;
1927 
1928   if (DstTy == S16)
1929     return legalizeFDIV16(MI, MRI, B);
1930   if (DstTy == S32)
1931     return legalizeFDIV32(MI, MRI, B);
1932   if (DstTy == S64)
1933     return legalizeFDIV64(MI, MRI, B);
1934 
1935   return false;
1936 }
1937 
1938 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1939                                                  MachineRegisterInfo &MRI,
1940                                                  MachineIRBuilder &B) const {
1941   Register Res = MI.getOperand(0).getReg();
1942   Register LHS = MI.getOperand(1).getReg();
1943   Register RHS = MI.getOperand(2).getReg();
1944 
1945   uint16_t Flags = MI.getFlags();
1946 
1947   LLT ResTy = MRI.getType(Res);
1948   LLT S32 = LLT::scalar(32);
1949   LLT S64 = LLT::scalar(64);
1950 
1951   const MachineFunction &MF = B.getMF();
1952   bool Unsafe =
1953     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1954 
1955   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1956     return false;
1957 
1958   if (!Unsafe && ResTy == S32 &&
1959       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
1960     return false;
1961 
1962   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1963     // 1 / x -> RCP(x)
1964     if (CLHS->isExactlyValue(1.0)) {
1965       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1966         .addUse(RHS)
1967         .setMIFlags(Flags);
1968 
1969       MI.eraseFromParent();
1970       return true;
1971     }
1972 
1973     // -1 / x -> RCP( FNEG(x) )
1974     if (CLHS->isExactlyValue(-1.0)) {
1975       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1976       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1977         .addUse(FNeg.getReg(0))
1978         .setMIFlags(Flags);
1979 
1980       MI.eraseFromParent();
1981       return true;
1982     }
1983   }
1984 
1985   // x / y -> x * (1.0 / y)
1986   if (Unsafe) {
1987     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1988       .addUse(RHS)
1989       .setMIFlags(Flags);
1990     B.buildFMul(Res, LHS, RCP, Flags);
1991 
1992     MI.eraseFromParent();
1993     return true;
1994   }
1995 
1996   return false;
1997 }
1998 
1999 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2000                                          MachineRegisterInfo &MRI,
2001                                          MachineIRBuilder &B) const {
2002   B.setInstr(MI);
2003   Register Res = MI.getOperand(0).getReg();
2004   Register LHS = MI.getOperand(1).getReg();
2005   Register RHS = MI.getOperand(2).getReg();
2006 
2007   uint16_t Flags = MI.getFlags();
2008 
2009   LLT S16 = LLT::scalar(16);
2010   LLT S32 = LLT::scalar(32);
2011 
2012   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2013   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2014 
2015   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2016     .addUse(RHSExt.getReg(0))
2017     .setMIFlags(Flags);
2018 
2019   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2020   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2021 
2022   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2023     .addUse(RDst.getReg(0))
2024     .addUse(RHS)
2025     .addUse(LHS)
2026     .setMIFlags(Flags);
2027 
2028   MI.eraseFromParent();
2029   return true;
2030 }
2031 
// Emit instructions to enable ('Enable' == true) or disable ('Enable' ==
// false) FP32 denormal mode.
2034 static void toggleSPDenormMode(bool Enable,
2035                                MachineIRBuilder &B,
2036                                const GCNSubtarget &ST,
2037                                AMDGPU::SIModeRegisterDefaults Mode) {
2038   // Set SP denorm mode to this value.
2039   unsigned SPDenormMode =
2040     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2041 
2042   if (ST.hasDenormModeInst()) {
2043     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2044     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2045                                    ? FP_DENORM_FLUSH_NONE
2046                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2047 
2048     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2049     B.buildInstr(AMDGPU::S_DENORM_MODE)
2050       .addImm(NewDenormModeValue);
2051 
2052   } else {
2053     // Select FP32 bit field in mode register.
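    // The field is hwreg(HW_REG_MODE, 4, 2), i.e. the two FP32 FP_DENORM bits.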
2054     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2055                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2056                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2057 
2058     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2059       .addImm(SPDenormMode)
2060       .addImm(SPDenormModeBitField);
2061   }
2062 }
2063 
2064 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2065                                          MachineRegisterInfo &MRI,
2066                                          MachineIRBuilder &B) const {
2067   B.setInstr(MI);
2068   Register Res = MI.getOperand(0).getReg();
2069   Register LHS = MI.getOperand(1).getReg();
2070   Register RHS = MI.getOperand(2).getReg();
2071   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2072   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2073 
2074   uint16_t Flags = MI.getFlags();
2075 
2076   LLT S32 = LLT::scalar(32);
2077   LLT S1 = LLT::scalar(1);
2078 
2079   auto One = B.buildFConstant(S32, 1.0f);
2080 
2081   auto DenominatorScaled =
2082     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2083       .addUse(RHS)
2084       .addUse(LHS)
2085       .addImm(1)
2086       .setMIFlags(Flags);
2087   auto NumeratorScaled =
2088     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2089       .addUse(LHS)
2090       .addUse(RHS)
2091       .addImm(0)
2092       .setMIFlags(Flags);
2093 
2094   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2095     .addUse(DenominatorScaled.getReg(0))
2096     .setMIFlags(Flags);
2097   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2098 
2099   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2100   // aren't modeled as reading it.
2101   if (!Mode.FP32Denormals)
2102     toggleSPDenormMode(true, B, ST, Mode);
2103 
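  // Newton-Raphson refinement of the reciprocal estimate. div_fmas folds in
  // the final error term and div_fixup handles the special cases and undoes
  // the div_scale scaling.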
2104   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2105   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2106   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2107   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2108   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2109   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2110 
2111   if (!Mode.FP32Denormals)
2112     toggleSPDenormMode(false, B, ST, Mode);
2113 
2114   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2115     .addUse(Fma4.getReg(0))
2116     .addUse(Fma1.getReg(0))
2117     .addUse(Fma3.getReg(0))
2118     .addUse(NumeratorScaled.getReg(1))
2119     .setMIFlags(Flags);
2120 
2121   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2122     .addUse(Fmas.getReg(0))
2123     .addUse(RHS)
2124     .addUse(LHS)
2125     .setMIFlags(Flags);
2126 
2127   MI.eraseFromParent();
2128   return true;
2129 }
2130 
2131 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2132                                          MachineRegisterInfo &MRI,
2133                                          MachineIRBuilder &B) const {
2134   B.setInstr(MI);
2135   Register Res = MI.getOperand(0).getReg();
2136   Register LHS = MI.getOperand(1).getReg();
2137   Register RHS = MI.getOperand(2).getReg();
2138 
2139   uint16_t Flags = MI.getFlags();
2140 
2141   LLT S64 = LLT::scalar(64);
2142   LLT S1 = LLT::scalar(1);
2143 
2144   auto One = B.buildFConstant(S64, 1.0);
2145 
2146   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2147     .addUse(LHS)
2148     .addUse(RHS)
2149     .addImm(1)
2150     .setMIFlags(Flags);
2151 
2152   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2153 
2154   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2155     .addUse(DivScale0.getReg(0))
2156     .setMIFlags(Flags);
2157 
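  // Same Newton-Raphson refinement scheme as the f32 case, in double
  // precision: refine the reciprocal, form the quotient, and let div_fmas /
  // div_fixup apply the final correction and special-case handling.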
2158   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2159   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2160   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2161 
2162   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2163     .addUse(LHS)
2164     .addUse(RHS)
2165     .addImm(0)
2166     .setMIFlags(Flags);
2167 
2168   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2170   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2171 
2172   Register Scale;
2173   if (!ST.hasUsableDivScaleConditionOutput()) {
2174     // Workaround a hardware bug on SI where the condition output from div_scale
2175     // is not usable.
2176 
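    // Recover the scale flag for div_fmas by hand: compare the high words of
    // the original operands against the high words of the div_scale results
    // and xor the two tests.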
2177     Scale = MRI.createGenericVirtualRegister(S1);
2178 
2179     LLT S32 = LLT::scalar(32);
2180 
2181     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2182     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2183     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2184     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2185 
2186     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2187                               Scale1Unmerge.getReg(1));
2188     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2189                               Scale0Unmerge.getReg(1));
2190     B.buildXor(Scale, CmpNum, CmpDen);
2191   } else {
2192     Scale = DivScale1.getReg(1);
2193   }
2194 
2195   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2196     .addUse(Fma4.getReg(0))
2197     .addUse(Fma3.getReg(0))
2198     .addUse(Mul.getReg(0))
2199     .addUse(Scale)
2200     .setMIFlags(Flags);
2201 
2202   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2203     .addUse(Fmas.getReg(0))
2204     .addUse(RHS)
2205     .addUse(LHS)
2206     .setMIFlags(Flags);
2207 
2208   MI.eraseFromParent();
2209   return true;
2210 }
2211 
2212 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2213                                                  MachineRegisterInfo &MRI,
2214                                                  MachineIRBuilder &B) const {
2215   B.setInstr(MI);
2216   Register Res = MI.getOperand(0).getReg();
2217   Register LHS = MI.getOperand(2).getReg();
2218   Register RHS = MI.getOperand(3).getReg();
2219   uint16_t Flags = MI.getFlags();
2220 
2221   LLT S32 = LLT::scalar(32);
2222   LLT S1 = LLT::scalar(1);
2223 
2224   auto Abs = B.buildFAbs(S32, RHS, Flags);
2225   const APFloat C0Val(1.0f);
2226 
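  // If |RHS| is very large (> 2^96), pre-scale it by 2^-32 so that its
  // reciprocal is not flushed to zero, then multiply the quotient by the same
  // 2^-32 factor to compensate.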
2227   auto C0 = B.buildConstant(S32, 0x6f800000);
2228   auto C1 = B.buildConstant(S32, 0x2f800000);
2229   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2230 
2231   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2232   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2233 
2234   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2235 
2236   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2237     .addUse(Mul0.getReg(0))
2238     .setMIFlags(Flags);
2239 
2240   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2241 
2242   B.buildFMul(Res, Sel, Mul1, Flags);
2243 
2244   MI.eraseFromParent();
2245   return true;
2246 }
2247 
2248 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2249                                                  MachineRegisterInfo &MRI,
2250                                                  MachineIRBuilder &B) const {
2251   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2252   if (!MFI->isEntryFunction()) {
2253     return legalizePreloadedArgIntrin(MI, MRI, B,
2254                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2255   }
2256 
2257   B.setInstr(MI);
2258 
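  // For entry functions the implicit arguments live directly after the
  // explicit kernel arguments, so the pointer is the kernarg segment pointer
  // plus the implicit argument offset.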
2259   uint64_t Offset =
2260     ST.getTargetLowering()->getImplicitParameterOffset(
2261       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2262   Register DstReg = MI.getOperand(0).getReg();
2263   LLT DstTy = MRI.getType(DstReg);
2264   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2265 
2266   const ArgDescriptor *Arg;
2267   const TargetRegisterClass *RC;
2268   std::tie(Arg, RC)
2269     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2270   if (!Arg)
2271     return false;
2272 
2273   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2274   if (!loadInputValue(KernargPtrReg, B, Arg))
2275     return false;
2276 
2277   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2278   MI.eraseFromParent();
2279   return true;
2280 }
2281 
2282 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2283                                               MachineRegisterInfo &MRI,
2284                                               MachineIRBuilder &B,
2285                                               unsigned AddrSpace) const {
2286   B.setInstr(MI);
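  // A flat pointer lies in the queried segment iff its high 32 bits equal
  // that segment's aperture base.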
2287   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2288   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2289   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2290   MI.eraseFromParent();
2291   return true;
2292 }
2293 
2294 /// Handle register layout difference for f16 images for some subtargets.
2295 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2296                                              MachineRegisterInfo &MRI,
2297                                              Register Reg) const {
2298   if (!ST.hasUnpackedD16VMem())
2299     return Reg;
2300 
2301   const LLT S16 = LLT::scalar(16);
2302   const LLT S32 = LLT::scalar(32);
2303   LLT StoreVT = MRI.getType(Reg);
2304   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2305 
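  // With unpacked D16 memory instructions each 16-bit element occupies its
  // own 32-bit register, so widen every element and rebuild the value as a
  // vector of s32.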
2306   auto Unmerge = B.buildUnmerge(S16, Reg);
2307 
2308   SmallVector<Register, 4> WideRegs;
2309   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2310     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2311 
2312   int NumElts = StoreVT.getNumElements();
2313 
2314   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2315 }
2316 
2317 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
2318                                                  MachineRegisterInfo &MRI,
2319                                                  MachineIRBuilder &B,
2320                                                  bool IsFormat) const {
2321   // TODO: Reject f16 format on targets where unsupported.
2322   Register VData = MI.getOperand(1).getReg();
2323   LLT Ty = MRI.getType(VData);
2324 
2325   B.setInstr(MI);
2326 
2327   const LLT S32 = LLT::scalar(32);
2328   const LLT S16 = LLT::scalar(16);
2329 
  // Fix up illegal register types for i8 and i16 stores.
2331   if (Ty == LLT::scalar(8) || Ty == S16) {
2332     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2333     MI.getOperand(1).setReg(AnyExt);
2334     return true;
2335   }
2336 
2337   if (Ty.isVector()) {
2338     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2339       if (IsFormat)
2340         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2341       return true;
2342     }
2343 
2344     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2345   }
2346 
2347   return Ty == S32;
2348 }
2349 
2350 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2351                                             MachineRegisterInfo &MRI,
2352                                             MachineIRBuilder &B) const {
  // Replace the G_BRCOND that uses the intrinsic's result with the
  // exec-manipulating branch pseudos.
2354   auto IntrID = MI.getIntrinsicID();
2355   switch (IntrID) {
2356   case Intrinsic::amdgcn_if:
2357   case Intrinsic::amdgcn_else: {
2358     MachineInstr *Br = nullptr;
2359     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2360       const SIRegisterInfo *TRI
2361         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2362 
2363       B.setInstr(*BrCond);
2364       Register Def = MI.getOperand(1).getReg();
2365       Register Use = MI.getOperand(3).getReg();
2366 
2367       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2368       if (Br)
2369         BrTarget = Br->getOperand(0).getMBB();
2370 
2371       if (IntrID == Intrinsic::amdgcn_if) {
2372         B.buildInstr(AMDGPU::SI_IF)
2373           .addDef(Def)
2374           .addUse(Use)
2375           .addMBB(BrTarget);
2376       } else {
2377         B.buildInstr(AMDGPU::SI_ELSE)
2378           .addDef(Def)
2379           .addUse(Use)
2380           .addMBB(BrTarget)
2381           .addImm(0);
2382       }
2383 
2384       if (Br)
2385         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2386 
2387       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2388       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2389       MI.eraseFromParent();
2390       BrCond->eraseFromParent();
2391       return true;
2392     }
2393 
2394     return false;
2395   }
2396   case Intrinsic::amdgcn_loop: {
2397     MachineInstr *Br = nullptr;
2398     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2399       const SIRegisterInfo *TRI
2400         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2401 
2402       B.setInstr(*BrCond);
2403 
2404       // FIXME: Need to adjust branch targets based on unconditional branch.
2405       Register Reg = MI.getOperand(2).getReg();
2406       B.buildInstr(AMDGPU::SI_LOOP)
2407         .addUse(Reg)
2408         .addMBB(BrCond->getOperand(1).getMBB());
2409       MI.eraseFromParent();
2410       BrCond->eraseFromParent();
2411       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2412       return true;
2413     }
2414 
2415     return false;
2416   }
2417   case Intrinsic::amdgcn_kernarg_segment_ptr:
2418     return legalizePreloadedArgIntrin(
2419       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2420   case Intrinsic::amdgcn_implicitarg_ptr:
2421     return legalizeImplicitArgPtr(MI, MRI, B);
2422   case Intrinsic::amdgcn_workitem_id_x:
2423     return legalizePreloadedArgIntrin(MI, MRI, B,
2424                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2425   case Intrinsic::amdgcn_workitem_id_y:
2426     return legalizePreloadedArgIntrin(MI, MRI, B,
2427                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2428   case Intrinsic::amdgcn_workitem_id_z:
2429     return legalizePreloadedArgIntrin(MI, MRI, B,
2430                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2431   case Intrinsic::amdgcn_workgroup_id_x:
2432     return legalizePreloadedArgIntrin(MI, MRI, B,
2433                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2434   case Intrinsic::amdgcn_workgroup_id_y:
2435     return legalizePreloadedArgIntrin(MI, MRI, B,
2436                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2437   case Intrinsic::amdgcn_workgroup_id_z:
2438     return legalizePreloadedArgIntrin(MI, MRI, B,
2439                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2440   case Intrinsic::amdgcn_dispatch_ptr:
2441     return legalizePreloadedArgIntrin(MI, MRI, B,
2442                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2443   case Intrinsic::amdgcn_queue_ptr:
2444     return legalizePreloadedArgIntrin(MI, MRI, B,
2445                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2446   case Intrinsic::amdgcn_implicit_buffer_ptr:
2447     return legalizePreloadedArgIntrin(
2448       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2449   case Intrinsic::amdgcn_dispatch_id:
2450     return legalizePreloadedArgIntrin(MI, MRI, B,
2451                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2452   case Intrinsic::amdgcn_fdiv_fast:
2453     return legalizeFDIVFastIntrin(MI, MRI, B);
2454   case Intrinsic::amdgcn_is_shared:
2455     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2456   case Intrinsic::amdgcn_is_private:
2457     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2458   case Intrinsic::amdgcn_wavefrontsize: {
2459     B.setInstr(MI);
2460     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2461     MI.eraseFromParent();
2462     return true;
2463   }
2464   case Intrinsic::amdgcn_raw_buffer_store:
2465     return legalizeRawBufferStore(MI, MRI, B, false);
2466   case Intrinsic::amdgcn_raw_buffer_store_format:
2467     return legalizeRawBufferStore(MI, MRI, B, true);
2468   default:
2469     return true;
2470   }
2471 
2472   return true;
2473 }
2474