1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Round the number of elements to the next power of two elements
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
52 // Round the number of bits to the next power of two bits
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
182   return [=](const LegalityQuery &Query) {
183     return Query.Types[TypeIdx0].getSizeInBits() <
184            Query.Types[TypeIdx1].getSizeInBits();
185   };
186 }
187 
188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
189   return [=](const LegalityQuery &Query) {
190     return Query.Types[TypeIdx0].getSizeInBits() >
191            Query.Types[TypeIdx1].getSizeInBits();
192   };
193 }
194 
195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
196                                          const GCNTargetMachine &TM)
197   :  ST(ST_) {
198   using namespace TargetOpcode;
199 
200   auto GetAddrSpacePtr = [&TM](unsigned AS) {
201     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
202   };
203 
204   const LLT S1 = LLT::scalar(1);
205   const LLT S16 = LLT::scalar(16);
206   const LLT S32 = LLT::scalar(32);
207   const LLT S64 = LLT::scalar(64);
208   const LLT S128 = LLT::scalar(128);
209   const LLT S256 = LLT::scalar(256);
210   const LLT S1024 = LLT::scalar(1024);
211 
212   const LLT V2S16 = LLT::vector(2, 16);
213   const LLT V4S16 = LLT::vector(4, 16);
214 
215   const LLT V2S32 = LLT::vector(2, 32);
216   const LLT V3S32 = LLT::vector(3, 32);
217   const LLT V4S32 = LLT::vector(4, 32);
218   const LLT V5S32 = LLT::vector(5, 32);
219   const LLT V6S32 = LLT::vector(6, 32);
220   const LLT V7S32 = LLT::vector(7, 32);
221   const LLT V8S32 = LLT::vector(8, 32);
222   const LLT V9S32 = LLT::vector(9, 32);
223   const LLT V10S32 = LLT::vector(10, 32);
224   const LLT V11S32 = LLT::vector(11, 32);
225   const LLT V12S32 = LLT::vector(12, 32);
226   const LLT V13S32 = LLT::vector(13, 32);
227   const LLT V14S32 = LLT::vector(14, 32);
228   const LLT V15S32 = LLT::vector(15, 32);
229   const LLT V16S32 = LLT::vector(16, 32);
230   const LLT V32S32 = LLT::vector(32, 32);
231 
232   const LLT V2S64 = LLT::vector(2, 64);
233   const LLT V3S64 = LLT::vector(3, 64);
234   const LLT V4S64 = LLT::vector(4, 64);
235   const LLT V5S64 = LLT::vector(5, 64);
236   const LLT V6S64 = LLT::vector(6, 64);
237   const LLT V7S64 = LLT::vector(7, 64);
238   const LLT V8S64 = LLT::vector(8, 64);
239   const LLT V16S64 = LLT::vector(16, 64);
240 
241   std::initializer_list<LLT> AllS32Vectors =
242     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
243      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
244   std::initializer_list<LLT> AllS64Vectors =
245     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
246 
247   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
248   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
249   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
250   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
251   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
252   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
253   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
254 
255   const LLT CodePtr = FlatPtr;
256 
257   const std::initializer_list<LLT> AddrSpaces64 = {
258     GlobalPtr, ConstantPtr, FlatPtr
259   };
260 
261   const std::initializer_list<LLT> AddrSpaces32 = {
262     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
263   };
264 
265   const std::initializer_list<LLT> FPTypesBase = {
266     S32, S64
267   };
268 
269   const std::initializer_list<LLT> FPTypes16 = {
270     S32, S64, S16
271   };
272 
273   const std::initializer_list<LLT> FPTypesPK16 = {
274     S32, S64, S16, V2S16
275   };
276 
277   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
278 
279   setAction({G_BRCOND, S1}, Legal); // VCC branches
280   setAction({G_BRCOND, S32}, Legal); // SCC branches
281 
282   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
283   // elements for v3s16
284   getActionDefinitionsBuilder(G_PHI)
285     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
286     .legalFor(AllS32Vectors)
287     .legalFor(AllS64Vectors)
288     .legalFor(AddrSpaces64)
289     .legalFor(AddrSpaces32)
290     .clampScalar(0, S32, S256)
291     .widenScalarToNextPow2(0, 32)
292     .clampMaxNumElements(0, S32, 16)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .legalIf(isPointer(0));
295 
296   if (ST.has16BitInsts()) {
297     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
298       .legalFor({S32, S16})
299       .clampScalar(0, S16, S32)
300       .scalarize(0)
301       .widenScalarToNextPow2(0, 32);
302   } else {
303     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
304       .legalFor({S32})
305       .clampScalar(0, S32, S32)
306       .scalarize(0);
307   }
308 
309   // FIXME: Not really legal. Placeholder for custom lowering.
310   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
311     .legalFor({S32, S64})
312     .clampScalar(0, S32, S64)
313     .widenScalarToNextPow2(0, 32)
314     .scalarize(0);
315 
316   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
317     .legalFor({S32})
318     .clampScalar(0, S32, S32)
319     .scalarize(0);
320 
321   // Report legal for any types we can handle anywhere. For the cases only legal
322   // on the SALU, RegBankSelect will be able to re-legalize.
323   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
324     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
325     .clampScalar(0, S32, S64)
326     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
327     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
328     .widenScalarToNextPow2(0)
329     .scalarize(0);
330 
331   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
332                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
333     .legalFor({{S32, S1}, {S32, S32}})
334     .clampScalar(0, S32, S32)
335     .scalarize(0); // TODO: Implement.
336 
337   getActionDefinitionsBuilder(G_BITCAST)
338     // Don't worry about the size constraint.
339     .legalIf(all(isRegisterType(0), isRegisterType(1)))
340     .lower();
341 
342 
343   getActionDefinitionsBuilder(G_CONSTANT)
344     .legalFor({S1, S32, S64, S16, GlobalPtr,
345                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
346     .clampScalar(0, S32, S64)
347     .widenScalarToNextPow2(0)
348     .legalIf(isPointer(0));
349 
350   getActionDefinitionsBuilder(G_FCONSTANT)
351     .legalFor({S32, S64, S16})
352     .clampScalar(0, S16, S64);
353 
354   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
355     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
356                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
357     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
358     .clampScalarOrElt(0, S32, S1024)
359     .legalIf(isMultiple32(0))
360     .widenScalarToNextPow2(0, 32)
361     .clampMaxNumElements(0, S32, 16);
362 
363   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
364   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
365     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
366   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
367 
368   auto &FPOpActions = getActionDefinitionsBuilder(
369     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
370     .legalFor({S32, S64});
371   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
372     .customFor({S32, S64});
373   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
374     .customFor({S32, S64});
375 
376   if (ST.has16BitInsts()) {
377     if (ST.hasVOP3PInsts())
378       FPOpActions.legalFor({S16, V2S16});
379     else
380       FPOpActions.legalFor({S16});
381 
382     TrigActions.customFor({S16});
383     FDIVActions.customFor({S16});
384   }
385 
386   auto &MinNumMaxNum = getActionDefinitionsBuilder({
387       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
388 
389   if (ST.hasVOP3PInsts()) {
390     MinNumMaxNum.customFor(FPTypesPK16)
391       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
392       .clampMaxNumElements(0, S16, 2)
393       .clampScalar(0, S16, S64)
394       .scalarize(0);
395   } else if (ST.has16BitInsts()) {
396     MinNumMaxNum.customFor(FPTypes16)
397       .clampScalar(0, S16, S64)
398       .scalarize(0);
399   } else {
400     MinNumMaxNum.customFor(FPTypesBase)
401       .clampScalar(0, S32, S64)
402       .scalarize(0);
403   }
404 
405   if (ST.hasVOP3PInsts())
406     FPOpActions.clampMaxNumElements(0, S16, 2);
407 
408   FPOpActions
409     .scalarize(0)
410     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
411 
412   TrigActions
413     .scalarize(0)
414     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
415 
416   FDIVActions
417     .scalarize(0)
418     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
419 
420   getActionDefinitionsBuilder({G_FNEG, G_FABS})
421     .legalFor(FPTypesPK16)
422     .clampMaxNumElements(0, S16, 2)
423     .scalarize(0)
424     .clampScalar(0, S16, S64);
425 
426   if (ST.has16BitInsts()) {
427     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
428       .legalFor({S32, S64, S16})
429       .scalarize(0)
430       .clampScalar(0, S16, S64);
431   } else {
432     getActionDefinitionsBuilder(G_FSQRT)
433       .legalFor({S32, S64})
434       .scalarize(0)
435       .clampScalar(0, S32, S64);
436 
437     if (ST.hasFractBug()) {
438       getActionDefinitionsBuilder(G_FFLOOR)
439         .customFor({S64})
440         .legalFor({S32, S64})
441         .scalarize(0)
442         .clampScalar(0, S32, S64);
443     } else {
444       getActionDefinitionsBuilder(G_FFLOOR)
445         .legalFor({S32, S64})
446         .scalarize(0)
447         .clampScalar(0, S32, S64);
448     }
449   }
450 
451   getActionDefinitionsBuilder(G_FPTRUNC)
452     .legalFor({{S32, S64}, {S16, S32}})
453     .scalarize(0)
454     .lower();
455 
456   getActionDefinitionsBuilder(G_FPEXT)
457     .legalFor({{S64, S32}, {S32, S16}})
458     .lowerFor({{S64, S16}}) // FIXME: Implement
459     .scalarize(0);
460 
461   getActionDefinitionsBuilder(G_FSUB)
462       // Use actual fsub instruction
463       .legalFor({S32})
464       // Must use fadd + fneg
465       .lowerFor({S64, S16, V2S16})
466       .scalarize(0)
467       .clampScalar(0, S32, S64);
468 
469   // Whether this is legal depends on the floating point mode for the function.
470   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
471   if (ST.hasMadF16())
472     FMad.customFor({S32, S16});
473   else
474     FMad.customFor({S32});
475   FMad.scalarize(0)
476       .lower();
477 
478   getActionDefinitionsBuilder(G_TRUNC)
479     .alwaysLegal();
480 
481   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
482     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
483                {S32, S1}, {S64, S1}, {S16, S1}})
484     .scalarize(0)
485     .clampScalar(0, S32, S64)
486     .widenScalarToNextPow2(1, 32);
487 
488   // TODO: Split s1->s64 during regbankselect for VALU.
489   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
490     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
491     .lowerFor({{S32, S64}})
492     .lowerIf(typeIs(1, S1))
493     .customFor({{S64, S64}});
494   if (ST.has16BitInsts())
495     IToFP.legalFor({{S16, S16}});
496   IToFP.clampScalar(1, S32, S64)
497        .scalarize(0)
498        .widenScalarToNextPow2(1);
499 
500   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
501     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
502     .customFor({{S64, S64}});
503   if (ST.has16BitInsts())
504     FPToI.legalFor({{S16, S16}});
505   else
506     FPToI.minScalar(1, S32);
507 
508   FPToI.minScalar(0, S32)
509        .scalarize(0)
510        .lower();
511 
512   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
513     .scalarize(0)
514     .lower();
515 
516   if (ST.has16BitInsts()) {
517     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
518       .legalFor({S16, S32, S64})
519       .clampScalar(0, S16, S64)
520       .scalarize(0);
521   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
522     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
523       .legalFor({S32, S64})
524       .clampScalar(0, S32, S64)
525       .scalarize(0);
526   } else {
527     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
528       .legalFor({S32})
529       .customFor({S64})
530       .clampScalar(0, S32, S64)
531       .scalarize(0);
532   }
533 
534   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
535     .scalarize(0)
536     .alwaysLegal();
537 
538   auto &CmpBuilder =
539     getActionDefinitionsBuilder(G_ICMP)
540     // The compare output type differs based on the register bank of the output,
541     // so make both s1 and s32 legal.
542     //
543     // Scalar compares producing output in scc will be promoted to s32, as that
544     // is the allocatable register type that will be needed for the copy from
545     // scc. This will be promoted during RegBankSelect, and we assume something
546     // before that won't try to use s32 result types.
547     //
548     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
549     // bank.
550     .legalForCartesianProduct(
551       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
552     .legalForCartesianProduct(
553       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
554   if (ST.has16BitInsts()) {
555     CmpBuilder.legalFor({{S1, S16}});
556   }
557 
558   CmpBuilder
559     .widenScalarToNextPow2(1)
560     .clampScalar(1, S32, S64)
561     .scalarize(0)
562     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
563 
564   getActionDefinitionsBuilder(G_FCMP)
565     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
566     .widenScalarToNextPow2(1)
567     .clampScalar(1, S32, S64)
568     .scalarize(0);
569 
570   // FIXME: fpow has a selection pattern that should move to custom lowering.
571   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
572   if (ST.has16BitInsts())
573     Exp2Ops.legalFor({S32, S16});
574   else
575     Exp2Ops.legalFor({S32});
576   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
577   Exp2Ops.scalarize(0);
578 
579   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
580   if (ST.has16BitInsts())
581     ExpOps.customFor({{S32}, {S16}});
582   else
583     ExpOps.customFor({S32});
584   ExpOps.clampScalar(0, MinScalarFPTy, S32)
585         .scalarize(0);
586 
587   // The 64-bit versions produce 32-bit results, but only on the SALU.
588   getActionDefinitionsBuilder(G_CTPOP)
589     .legalFor({{S32, S32}, {S32, S64}})
590     .clampScalar(0, S32, S32)
591     .clampScalar(1, S32, S64)
592     .scalarize(0)
593     .widenScalarToNextPow2(0, 32)
594     .widenScalarToNextPow2(1, 32);
595 
596   // The hardware instructions return a different result on 0 than the generic
597   // instructions expect. The hardware produces -1, but these produce the
598   // bitwidth.
599   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
600     .scalarize(0)
601     .clampScalar(0, S32, S32)
602     .clampScalar(1, S32, S64)
603     .widenScalarToNextPow2(0, 32)
604     .widenScalarToNextPow2(1, 32)
605     .lower();
606 
607   // The 64-bit versions produce 32-bit results, but only on the SALU.
608   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
609     .legalFor({{S32, S32}, {S32, S64}})
610     .clampScalar(0, S32, S32)
611     .clampScalar(1, S32, S64)
612     .scalarize(0)
613     .widenScalarToNextPow2(0, 32)
614     .widenScalarToNextPow2(1, 32);
615 
616   getActionDefinitionsBuilder(G_BITREVERSE)
617     .legalFor({S32})
618     .clampScalar(0, S32, S32)
619     .scalarize(0);
620 
621   if (ST.has16BitInsts()) {
622     getActionDefinitionsBuilder(G_BSWAP)
623       .legalFor({S16, S32, V2S16})
624       .clampMaxNumElements(0, S16, 2)
625       // FIXME: Fixing non-power-of-2 before clamp is workaround for
626       // narrowScalar limitation.
627       .widenScalarToNextPow2(0)
628       .clampScalar(0, S16, S32)
629       .scalarize(0);
630 
631     if (ST.hasVOP3PInsts()) {
632       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
633         .legalFor({S32, S16, V2S16})
634         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
635         .clampMaxNumElements(0, S16, 2)
636         .clampScalar(0, S16, S32)
637         .widenScalarToNextPow2(0)
638         .scalarize(0);
639     } else {
640       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
641         .legalFor({S32, S16})
642         .widenScalarToNextPow2(0)
643         .clampScalar(0, S16, S32)
644         .scalarize(0);
645     }
646   } else {
647     // TODO: Should have same legality without v_perm_b32
648     getActionDefinitionsBuilder(G_BSWAP)
649       .legalFor({S32})
650       .lowerIf(narrowerThan(0, 32))
651       // FIXME: Fixing non-power-of-2 before clamp is workaround for
652       // narrowScalar limitation.
653       .widenScalarToNextPow2(0)
654       .maxScalar(0, S32)
655       .scalarize(0)
656       .lower();
657 
658     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
659       .legalFor({S32})
660       .clampScalar(0, S32, S32)
661       .widenScalarToNextPow2(0)
662       .scalarize(0);
663   }
664 
665   getActionDefinitionsBuilder(G_INTTOPTR)
666     // List the common cases
667     .legalForCartesianProduct(AddrSpaces64, {S64})
668     .legalForCartesianProduct(AddrSpaces32, {S32})
669     .scalarize(0)
670     // Accept any address space as long as the size matches
671     .legalIf(sameSize(0, 1))
672     .widenScalarIf(smallerThan(1, 0),
673       [](const LegalityQuery &Query) {
674         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
675       })
676     .narrowScalarIf(greaterThan(1, 0),
677       [](const LegalityQuery &Query) {
678         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
679       });
680 
681   getActionDefinitionsBuilder(G_PTRTOINT)
682     // List the common cases
683     .legalForCartesianProduct(AddrSpaces64, {S64})
684     .legalForCartesianProduct(AddrSpaces32, {S32})
685     .scalarize(0)
686     // Accept any address space as long as the size matches
687     .legalIf(sameSize(0, 1))
688     .widenScalarIf(smallerThan(0, 1),
689       [](const LegalityQuery &Query) {
690         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
691       })
692     .narrowScalarIf(
693       greaterThan(0, 1),
694       [](const LegalityQuery &Query) {
695         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
696       });
697 
698   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
699     .scalarize(0)
700     .custom();
701 
702   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
703   // handle some operations by just promoting the register during
704   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
705   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
706     switch (AS) {
707     // FIXME: Private element size.
708     case AMDGPUAS::PRIVATE_ADDRESS:
709       return 32;
710     // FIXME: Check subtarget
711     case AMDGPUAS::LOCAL_ADDRESS:
712       return ST.useDS128() ? 128 : 64;
713 
714     // Treat constant and global as identical. SMRD loads are sometimes usable
715     // for global loads (ideally constant address space should be eliminated)
716     // depending on the context. Legality cannot be context dependent, but
717     // RegBankSelect can split the load as necessary depending on the pointer
718     // register bank/uniformity and if the memory is invariant or not written in
719     // a kernel.
720     case AMDGPUAS::CONSTANT_ADDRESS:
721     case AMDGPUAS::GLOBAL_ADDRESS:
722       return IsLoad ? 512 : 128;
723     default:
724       return 128;
725     }
726   };
727 
728   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
729                                     bool IsLoad) -> bool {
730     const LLT DstTy = Query.Types[0];
731 
732     // Split vector extloads.
733     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
734     unsigned Align = Query.MMODescrs[0].AlignInBits;
735 
736     if (MemSize < DstTy.getSizeInBits())
737       MemSize = std::max(MemSize, Align);
738 
739     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
740       return true;
741 
742     const LLT PtrTy = Query.Types[1];
743     unsigned AS = PtrTy.getAddressSpace();
744     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
745       return true;
746 
747     // Catch weird sized loads that don't evenly divide into the access sizes
748     // TODO: May be able to widen depending on alignment etc.
749     unsigned NumRegs = (MemSize + 31) / 32;
750     if (NumRegs == 3) {
751       if (!ST.hasDwordx3LoadStores())
752         return true;
753     } else {
754       // If the alignment allows, these should have been widened.
755       if (!isPowerOf2_32(NumRegs))
756         return true;
757     }
758 
759     if (Align < MemSize) {
760       const SITargetLowering *TLI = ST.getTargetLowering();
761       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
762     }
763 
764     return false;
765   };
766 
767   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
768     unsigned Size = Query.Types[0].getSizeInBits();
769     if (isPowerOf2_32(Size))
770       return false;
771 
772     if (Size == 96 && ST.hasDwordx3LoadStores())
773       return false;
774 
775     unsigned AddrSpace = Query.Types[1].getAddressSpace();
776     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
777       return false;
778 
779     unsigned Align = Query.MMODescrs[0].AlignInBits;
780     unsigned RoundedSize = NextPowerOf2(Size);
781     return (Align >= RoundedSize);
782   };
783 
784   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
785   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
786   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
787 
788   // TODO: Refine based on subtargets which support unaligned access or 128-bit
789   // LDS
790   // TODO: Unsupported flat for SI.
791 
792   for (unsigned Op : {G_LOAD, G_STORE}) {
793     const bool IsStore = Op == G_STORE;
794 
795     auto &Actions = getActionDefinitionsBuilder(Op);
796     // Whitelist the common cases.
797     // TODO: Loads to s16 on gfx9
798     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
799                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
800                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
801                                       {S128, GlobalPtr, 128, GlobalAlign32},
802                                       {S64, GlobalPtr, 64, GlobalAlign32},
803                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
804                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
805                                       {S32, GlobalPtr, 8, GlobalAlign8},
806                                       {S32, GlobalPtr, 16, GlobalAlign16},
807 
808                                       {S32, LocalPtr, 32, 32},
809                                       {S64, LocalPtr, 64, 32},
810                                       {V2S32, LocalPtr, 64, 32},
811                                       {S32, LocalPtr, 8, 8},
812                                       {S32, LocalPtr, 16, 16},
813                                       {V2S16, LocalPtr, 32, 32},
814 
815                                       {S32, PrivatePtr, 32, 32},
816                                       {S32, PrivatePtr, 8, 8},
817                                       {S32, PrivatePtr, 16, 16},
818                                       {V2S16, PrivatePtr, 32, 32},
819 
820                                       {S32, FlatPtr, 32, GlobalAlign32},
821                                       {S32, FlatPtr, 16, GlobalAlign16},
822                                       {S32, FlatPtr, 8, GlobalAlign8},
823                                       {V2S16, FlatPtr, 32, GlobalAlign32},
824 
825                                       {S32, ConstantPtr, 32, GlobalAlign32},
826                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
827                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
828                                       {S64, ConstantPtr, 64, GlobalAlign32},
829                                       {S128, ConstantPtr, 128, GlobalAlign32},
830                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
831     Actions
832         .customIf(typeIs(1, Constant32Ptr))
833         // Widen suitably aligned loads by loading extra elements.
834         .moreElementsIf([=](const LegalityQuery &Query) {
835             const LLT Ty = Query.Types[0];
836             return Op == G_LOAD && Ty.isVector() &&
837                    shouldWidenLoadResult(Query);
838           }, moreElementsToNextPow2(0))
839         .widenScalarIf([=](const LegalityQuery &Query) {
840             const LLT Ty = Query.Types[0];
841             return Op == G_LOAD && !Ty.isVector() &&
842                    shouldWidenLoadResult(Query);
843           }, widenScalarOrEltToNextPow2(0))
844         .narrowScalarIf(
845             [=](const LegalityQuery &Query) -> bool {
846               return !Query.Types[0].isVector() &&
847                      needToSplitMemOp(Query, Op == G_LOAD);
848             },
849             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
850               const LLT DstTy = Query.Types[0];
851               const LLT PtrTy = Query.Types[1];
852 
853               const unsigned DstSize = DstTy.getSizeInBits();
854               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
855 
856               // Split extloads.
857               if (DstSize > MemSize)
858                 return std::make_pair(0, LLT::scalar(MemSize));
859 
860               if (!isPowerOf2_32(DstSize)) {
861                 // We're probably decomposing an odd sized store. Try to split
862                 // to the widest type. TODO: Account for alignment. As-is it
863                 // should be OK, since the new parts will be further legalized.
864                 unsigned FloorSize = PowerOf2Floor(DstSize);
865                 return std::make_pair(0, LLT::scalar(FloorSize));
866               }
867 
868               if (DstSize > 32 && (DstSize % 32 != 0)) {
869                 // FIXME: Need a way to specify non-extload of larger size if
870                 // suitably aligned.
871                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
872               }
873 
874               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
875                                                      Op == G_LOAD);
876               if (MemSize > MaxSize)
877                 return std::make_pair(0, LLT::scalar(MaxSize));
878 
879               unsigned Align = Query.MMODescrs[0].AlignInBits;
880               return std::make_pair(0, LLT::scalar(Align));
881             })
882         .fewerElementsIf(
883             [=](const LegalityQuery &Query) -> bool {
884               return Query.Types[0].isVector() &&
885                      needToSplitMemOp(Query, Op == G_LOAD);
886             },
887             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
888               const LLT DstTy = Query.Types[0];
889               const LLT PtrTy = Query.Types[1];
890 
891               LLT EltTy = DstTy.getElementType();
892               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
893                                                      Op == G_LOAD);
894 
895               // FIXME: Handle widened to power of 2 results better. This ends
896               // up scalarizing.
897               // FIXME: 3 element stores scalarized on SI
898 
899               // Split if it's too large for the address space.
900               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
901                 unsigned NumElts = DstTy.getNumElements();
902                 unsigned EltSize = EltTy.getSizeInBits();
903 
904                 if (MaxSize % EltSize == 0) {
905                   return std::make_pair(
906                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
907                 }
908 
909                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
910 
911                 // FIXME: Refine when odd breakdowns handled
912                 // The scalars will need to be re-legalized.
913                 if (NumPieces == 1 || NumPieces >= NumElts ||
914                     NumElts % NumPieces != 0)
915                   return std::make_pair(0, EltTy);
916 
917                 return std::make_pair(0,
918                                       LLT::vector(NumElts / NumPieces, EltTy));
919               }
920 
921               // FIXME: We could probably handle weird extending loads better.
922               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
923               if (DstTy.getSizeInBits() > MemSize)
924                 return std::make_pair(0, EltTy);
925 
926               unsigned EltSize = EltTy.getSizeInBits();
927               unsigned DstSize = DstTy.getSizeInBits();
928               if (!isPowerOf2_32(DstSize)) {
929                 // We're probably decomposing an odd sized store. Try to split
930                 // to the widest type. TODO: Account for alignment. As-is it
931                 // should be OK, since the new parts will be further legalized.
932                 unsigned FloorSize = PowerOf2Floor(DstSize);
933                 return std::make_pair(
934                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
935               }
936 
937               // Need to split because of alignment.
938               unsigned Align = Query.MMODescrs[0].AlignInBits;
939               if (EltSize > Align &&
940                   (EltSize / Align < DstTy.getNumElements())) {
941                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
942               }
943 
944               // May need relegalization for the scalars.
945               return std::make_pair(0, EltTy);
946             })
947         .minScalar(0, S32);
948 
949     if (IsStore)
950       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
951 
952     // TODO: Need a bitcast lower option?
953     Actions
954         .legalIf([=](const LegalityQuery &Query) {
955           const LLT Ty0 = Query.Types[0];
956           unsigned Size = Ty0.getSizeInBits();
957           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
958           unsigned Align = Query.MMODescrs[0].AlignInBits;
959 
960           // FIXME: Widening store from alignment not valid.
961           if (MemSize < Size)
962             MemSize = std::max(MemSize, Align);
963 
964           // No extending vector loads.
965           if (Size > MemSize && Ty0.isVector())
966             return false;
967 
968           switch (MemSize) {
969           case 8:
970           case 16:
971             return Size == 32;
972           case 32:
973           case 64:
974           case 128:
975             return true;
976           case 96:
977             return ST.hasDwordx3LoadStores();
978           case 256:
979           case 512:
980             return true;
981           default:
982             return false;
983           }
984         })
985         .widenScalarToNextPow2(0)
986         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
987   }
988 
989   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
990                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
991                                                   {S32, GlobalPtr, 16, 2 * 8},
992                                                   {S32, LocalPtr, 8, 8},
993                                                   {S32, LocalPtr, 16, 16},
994                                                   {S32, PrivatePtr, 8, 8},
995                                                   {S32, PrivatePtr, 16, 16},
996                                                   {S32, ConstantPtr, 8, 8},
997                                                   {S32, ConstantPtr, 16, 2 * 8}});
998   if (ST.hasFlatAddressSpace()) {
999     ExtLoads.legalForTypesWithMemDesc(
1000         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1001   }
1002 
1003   ExtLoads.clampScalar(0, S32, S32)
1004           .widenScalarToNextPow2(0)
1005           .unsupportedIfMemSizeNotPow2()
1006           .lower();
1007 
1008   auto &Atomics = getActionDefinitionsBuilder(
1009     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1010      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1011      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1012      G_ATOMICRMW_UMIN})
1013     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1014                {S64, GlobalPtr}, {S64, LocalPtr}});
1015   if (ST.hasFlatAddressSpace()) {
1016     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1017   }
1018 
1019   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1020     .legalFor({{S32, LocalPtr}});
1021 
1022   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1023   // demarshalling
1024   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1025     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1026                 {S32, FlatPtr}, {S64, FlatPtr}})
1027     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1028                {S32, RegionPtr}, {S64, RegionPtr}});
1029   // TODO: Pointer types, any 32-bit or 64-bit vector
1030 
1031   // Condition should be s32 for scalar, s1 for vector.
1032   getActionDefinitionsBuilder(G_SELECT)
1033     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1034           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1035           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1036     .clampScalar(0, S16, S64)
1037     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1038     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1039     .scalarize(1)
1040     .clampMaxNumElements(0, S32, 2)
1041     .clampMaxNumElements(0, LocalPtr, 2)
1042     .clampMaxNumElements(0, PrivatePtr, 2)
1043     .scalarize(0)
1044     .widenScalarToNextPow2(0)
1045     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1046 
1047   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1048   // be more flexible with the shift amount type.
1049   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1050     .legalFor({{S32, S32}, {S64, S32}});
1051   if (ST.has16BitInsts()) {
1052     if (ST.hasVOP3PInsts()) {
1053       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1054             .clampMaxNumElements(0, S16, 2);
1055     } else
1056       Shifts.legalFor({{S16, S32}, {S16, S16}});
1057 
1058     // TODO: Support 16-bit shift amounts
1059     Shifts.clampScalar(1, S32, S32);
1060     Shifts.clampScalar(0, S16, S64);
1061     Shifts.widenScalarToNextPow2(0, 16);
1062   } else {
1063     // Make sure we legalize the shift amount type first, as the general
1064     // expansion for the shifted type will produce much worse code if it hasn't
1065     // been truncated already.
1066     Shifts.clampScalar(1, S32, S32);
1067     Shifts.clampScalar(0, S32, S64);
1068     Shifts.widenScalarToNextPow2(0, 32);
1069   }
1070   Shifts.scalarize(0);
1071 
1072   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1073     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1074     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1075     unsigned IdxTypeIdx = 2;
1076 
1077     getActionDefinitionsBuilder(Op)
1078       .customIf([=](const LegalityQuery &Query) {
1079           const LLT EltTy = Query.Types[EltTypeIdx];
1080           const LLT VecTy = Query.Types[VecTypeIdx];
1081           const LLT IdxTy = Query.Types[IdxTypeIdx];
1082           return (EltTy.getSizeInBits() == 16 ||
1083                   EltTy.getSizeInBits() % 32 == 0) &&
1084                  VecTy.getSizeInBits() % 32 == 0 &&
1085                  VecTy.getSizeInBits() <= 1024 &&
1086                  IdxTy.getSizeInBits() == 32;
1087         })
1088       .clampScalar(EltTypeIdx, S32, S64)
1089       .clampScalar(VecTypeIdx, S32, S64)
1090       .clampScalar(IdxTypeIdx, S32, S32);
1091   }
1092 
1093   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1094     .unsupportedIf([=](const LegalityQuery &Query) {
1095         const LLT &EltTy = Query.Types[1].getElementType();
1096         return Query.Types[0] != EltTy;
1097       });
1098 
1099   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1100     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1101     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1102 
1103     // FIXME: Doesn't handle extract of illegal sizes.
1104     getActionDefinitionsBuilder(Op)
1105       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1106       // FIXME: Multiples of 16 should not be legal.
1107       .legalIf([=](const LegalityQuery &Query) {
1108           const LLT BigTy = Query.Types[BigTyIdx];
1109           const LLT LitTy = Query.Types[LitTyIdx];
1110           return (BigTy.getSizeInBits() % 32 == 0) &&
1111                  (LitTy.getSizeInBits() % 16 == 0);
1112         })
1113       .widenScalarIf(
1114         [=](const LegalityQuery &Query) {
1115           const LLT BigTy = Query.Types[BigTyIdx];
1116           return (BigTy.getScalarSizeInBits() < 16);
1117         },
1118         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1119       .widenScalarIf(
1120         [=](const LegalityQuery &Query) {
1121           const LLT LitTy = Query.Types[LitTyIdx];
1122           return (LitTy.getScalarSizeInBits() < 16);
1123         },
1124         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1125       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1126       .widenScalarToNextPow2(BigTyIdx, 32);
1127 
1128   }
1129 
1130   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1131     .legalForCartesianProduct(AllS32Vectors, {S32})
1132     .legalForCartesianProduct(AllS64Vectors, {S64})
1133     .clampNumElements(0, V16S32, V32S32)
1134     .clampNumElements(0, V2S64, V16S64)
1135     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1136 
1137   if (ST.hasScalarPackInsts()) {
1138     BuildVector
1139       // FIXME: Should probably widen s1 vectors straight to s32
1140       .minScalarOrElt(0, S16)
1141       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1142       .minScalar(1, S32);
1143 
1144     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1145       .legalFor({V2S16, S32})
1146       .lower();
1147     BuildVector.minScalarOrElt(0, S32);
1148   } else {
1149     BuildVector.customFor({V2S16, S16});
1150     BuildVector.minScalarOrElt(0, S32);
1151 
1152     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1153       .customFor({V2S16, S32})
1154       .lower();
1155   }
1156 
1157   BuildVector.legalIf(isRegisterType(0));
1158 
1159   // FIXME: Clamp maximum size
1160   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1161     .legalIf(isRegisterType(0));
1162 
1163   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1164   // pre-legalize.
1165   if (ST.hasVOP3PInsts()) {
1166     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1167       .customFor({V2S16, V2S16})
1168       .lower();
1169   } else
1170     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1171 
1172   // Merge/Unmerge
1173   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1174     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1175     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1176 
1177     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1178       const LLT &Ty = Query.Types[TypeIdx];
1179       if (Ty.isVector()) {
1180         const LLT &EltTy = Ty.getElementType();
1181         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1182           return true;
1183         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1184           return true;
1185       }
1186       return false;
1187     };
1188 
1189     auto &Builder = getActionDefinitionsBuilder(Op)
1190       // Try to widen to s16 first for small types.
1191       // TODO: Only do this on targets with legal s16 shifts
1192       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1193 
1194       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1195       .lowerFor({{S16, V2S16}})
1196       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1197       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1198                            elementTypeIs(1, S16)),
1199                        changeTo(1, V2S16))
1200       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1201       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1202       // valid.
1203       .clampScalar(LitTyIdx, S32, S256)
1204       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1205       // Break up vectors with weird elements into scalars
1206       .fewerElementsIf(
1207         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1208         scalarize(0))
1209       .fewerElementsIf(
1210         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1211         scalarize(1))
1212       .clampScalar(BigTyIdx, S32, S1024);
1213 
1214     if (Op == G_MERGE_VALUES) {
1215       Builder.widenScalarIf(
1216         // TODO: Use 16-bit shifts if legal for 8-bit values?
1217         [=](const LegalityQuery &Query) {
1218           const LLT Ty = Query.Types[LitTyIdx];
1219           return Ty.getSizeInBits() < 32;
1220         },
1221         changeTo(LitTyIdx, S32));
1222     }
1223 
1224     Builder.widenScalarIf(
1225       [=](const LegalityQuery &Query) {
1226         const LLT Ty = Query.Types[BigTyIdx];
1227         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1228           Ty.getSizeInBits() % 16 != 0;
1229       },
1230       [=](const LegalityQuery &Query) {
1231         // Pick the next power of 2, or a multiple of 64 over 128.
1232         // Whichever is smaller.
1233         const LLT &Ty = Query.Types[BigTyIdx];
1234         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1235         if (NewSizeInBits >= 256) {
1236           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1237           if (RoundedTo < NewSizeInBits)
1238             NewSizeInBits = RoundedTo;
1239         }
1240         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1241       })
1242       .legalIf([=](const LegalityQuery &Query) {
1243           const LLT &BigTy = Query.Types[BigTyIdx];
1244           const LLT &LitTy = Query.Types[LitTyIdx];
1245 
1246           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1247             return false;
1248           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1249             return false;
1250 
1251           return BigTy.getSizeInBits() % 16 == 0 &&
1252                  LitTy.getSizeInBits() % 16 == 0 &&
1253                  BigTy.getSizeInBits() <= 1024;
1254         })
1255       // Any vectors left are the wrong size. Scalarize them.
1256       .scalarize(0)
1257       .scalarize(1);
1258   }
1259 
1260   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1261   // RegBankSelect.
1262   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1263     .legalFor({{S32}, {S64}});
1264 
1265   if (ST.hasVOP3PInsts()) {
1266     SextInReg.lowerFor({{V2S16}})
1267       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1268       // get more vector shift opportunities, since we'll get those when
1269       // expanded.
1270       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1271   } else if (ST.has16BitInsts()) {
1272     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1273   } else {
1274     // Prefer to promote to s32 before lowering if we don't have 16-bit
1275     // shifts. This avoid a lot of intermediate truncate and extend operations.
1276     SextInReg.lowerFor({{S32}, {S64}});
1277   }
1278 
1279   SextInReg
1280     .scalarize(0)
1281     .clampScalar(0, S32, S64)
1282     .lower();
1283 
1284   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1285     .legalFor({S64});
1286 
1287   getActionDefinitionsBuilder({
1288       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1289       G_FCOPYSIGN,
1290 
1291       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1292       G_READ_REGISTER,
1293       G_WRITE_REGISTER,
1294 
1295       G_SADDO, G_SSUBO,
1296 
1297        // TODO: Implement
1298       G_FMINIMUM, G_FMAXIMUM
1299     }).lower();
1300 
1301   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1302         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1303         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1304     .unsupported();
1305 
1306   computeTables();
1307   verify(*ST.getInstrInfo());
1308 }
1309 
1310 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1311                                          MachineRegisterInfo &MRI,
1312                                          MachineIRBuilder &B,
1313                                          GISelChangeObserver &Observer) const {
1314   switch (MI.getOpcode()) {
1315   case TargetOpcode::G_ADDRSPACE_CAST:
1316     return legalizeAddrSpaceCast(MI, MRI, B);
1317   case TargetOpcode::G_FRINT:
1318     return legalizeFrint(MI, MRI, B);
1319   case TargetOpcode::G_FCEIL:
1320     return legalizeFceil(MI, MRI, B);
1321   case TargetOpcode::G_INTRINSIC_TRUNC:
1322     return legalizeIntrinsicTrunc(MI, MRI, B);
1323   case TargetOpcode::G_SITOFP:
1324     return legalizeITOFP(MI, MRI, B, true);
1325   case TargetOpcode::G_UITOFP:
1326     return legalizeITOFP(MI, MRI, B, false);
1327   case TargetOpcode::G_FPTOSI:
1328     return legalizeFPTOI(MI, MRI, B, true);
1329   case TargetOpcode::G_FPTOUI:
1330     return legalizeFPTOI(MI, MRI, B, false);
1331   case TargetOpcode::G_FMINNUM:
1332   case TargetOpcode::G_FMAXNUM:
1333   case TargetOpcode::G_FMINNUM_IEEE:
1334   case TargetOpcode::G_FMAXNUM_IEEE:
1335     return legalizeMinNumMaxNum(MI, MRI, B);
1336   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1337     return legalizeExtractVectorElt(MI, MRI, B);
1338   case TargetOpcode::G_INSERT_VECTOR_ELT:
1339     return legalizeInsertVectorElt(MI, MRI, B);
1340   case TargetOpcode::G_SHUFFLE_VECTOR:
1341     return legalizeShuffleVector(MI, MRI, B);
1342   case TargetOpcode::G_FSIN:
1343   case TargetOpcode::G_FCOS:
1344     return legalizeSinCos(MI, MRI, B);
1345   case TargetOpcode::G_GLOBAL_VALUE:
1346     return legalizeGlobalValue(MI, MRI, B);
1347   case TargetOpcode::G_LOAD:
1348     return legalizeLoad(MI, MRI, B, Observer);
1349   case TargetOpcode::G_FMAD:
1350     return legalizeFMad(MI, MRI, B);
1351   case TargetOpcode::G_FDIV:
1352     return legalizeFDIV(MI, MRI, B);
1353   case TargetOpcode::G_ATOMIC_CMPXCHG:
1354     return legalizeAtomicCmpXChg(MI, MRI, B);
1355   case TargetOpcode::G_FLOG:
1356     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1357   case TargetOpcode::G_FLOG10:
1358     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1359   case TargetOpcode::G_FEXP:
1360     return legalizeFExp(MI, B);
1361   case TargetOpcode::G_FFLOOR:
1362     return legalizeFFloor(MI, MRI, B);
1363   case TargetOpcode::G_BUILD_VECTOR:
1364     return legalizeBuildVector(MI, MRI, B);
1365   default:
1366     return false;
1367   }
1368 
1369   llvm_unreachable("expected switch to return");
1370 }
1371 
1372 Register AMDGPULegalizerInfo::getSegmentAperture(
1373   unsigned AS,
1374   MachineRegisterInfo &MRI,
1375   MachineIRBuilder &B) const {
1376   MachineFunction &MF = B.getMF();
1377   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1378   const LLT S32 = LLT::scalar(32);
1379 
1380   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1381 
1382   if (ST.hasApertureRegs()) {
1383     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1384     // getreg.
1385     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1386         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1387         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1388     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1389         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1390         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1391     unsigned Encoding =
1392         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1393         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1394         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1395 
1396     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1397 
1398     B.buildInstr(AMDGPU::S_GETREG_B32)
1399       .addDef(GetReg)
1400       .addImm(Encoding);
1401     MRI.setType(GetReg, S32);
1402 
1403     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1404     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1405   }
1406 
1407   Register QueuePtr = MRI.createGenericVirtualRegister(
1408     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1409 
1410   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1411   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1412     return Register();
1413 
1414   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1415   // private_segment_aperture_base_hi.
1416   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1417 
1418   // TODO: can we be smarter about machine pointer info?
1419   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1420   MachineMemOperand *MMO = MF.getMachineMemOperand(
1421     PtrInfo,
1422     MachineMemOperand::MOLoad |
1423     MachineMemOperand::MODereferenceable |
1424     MachineMemOperand::MOInvariant,
1425     4,
1426     MinAlign(64, StructOffset));
1427 
1428   Register LoadAddr;
1429 
1430   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1431   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1432 }
1433 
1434 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1435   MachineInstr &MI, MachineRegisterInfo &MRI,
1436   MachineIRBuilder &B) const {
1437   MachineFunction &MF = B.getMF();
1438 
1439   B.setInstr(MI);
1440 
1441   const LLT S32 = LLT::scalar(32);
1442   Register Dst = MI.getOperand(0).getReg();
1443   Register Src = MI.getOperand(1).getReg();
1444 
1445   LLT DstTy = MRI.getType(Dst);
1446   LLT SrcTy = MRI.getType(Src);
1447   unsigned DestAS = DstTy.getAddressSpace();
1448   unsigned SrcAS = SrcTy.getAddressSpace();
1449 
1450   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1451   // vector element.
1452   assert(!DstTy.isVector());
1453 
1454   const AMDGPUTargetMachine &TM
1455     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1456 
1457   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1458   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1459     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1460     return true;
1461   }
1462 
1463   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1464     // Truncate.
1465     B.buildExtract(Dst, Src, 0);
1466     MI.eraseFromParent();
1467     return true;
1468   }
1469 
1470   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1471     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1472     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1473 
1474     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1475     // another. Merge operands are required to be the same type, but creating an
1476     // extra ptrtoint would be kind of pointless.
1477     auto HighAddr = B.buildConstant(
1478       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1479     B.buildMerge(Dst, {Src, HighAddr});
1480     MI.eraseFromParent();
1481     return true;
1482   }
1483 
1484   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1485     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1486            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1487     unsigned NullVal = TM.getNullPointerValue(DestAS);
1488 
1489     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1490     auto FlatNull = B.buildConstant(SrcTy, 0);
1491 
1492     // Extract low 32-bits of the pointer.
1493     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1494 
1495     auto CmpRes =
1496         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1497     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1498 
1499     MI.eraseFromParent();
1500     return true;
1501   }
1502 
1503   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1504     return false;
1505 
1506   if (!ST.hasFlatAddressSpace())
1507     return false;
1508 
1509   auto SegmentNull =
1510       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1511   auto FlatNull =
1512       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1513 
1514   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1515   if (!ApertureReg.isValid())
1516     return false;
1517 
1518   auto CmpRes =
1519       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1520 
1521   // Coerce the type of the low half of the result so we can use merge_values.
1522   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1523 
1524   // TODO: Should we allow mismatched types but matching sizes in merges to
1525   // avoid the ptrtoint?
1526   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1527   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1528 
1529   MI.eraseFromParent();
1530   return true;
1531 }
1532 
1533 bool AMDGPULegalizerInfo::legalizeFrint(
1534   MachineInstr &MI, MachineRegisterInfo &MRI,
1535   MachineIRBuilder &B) const {
1536   B.setInstr(MI);
1537 
1538   Register Src = MI.getOperand(1).getReg();
1539   LLT Ty = MRI.getType(Src);
1540   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1541 
1542   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1543   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1544 
1545   auto C1 = B.buildFConstant(Ty, C1Val);
1546   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1547 
1548   // TODO: Should this propagate fast-math-flags?
1549   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1550   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1551 
1552   auto C2 = B.buildFConstant(Ty, C2Val);
1553   auto Fabs = B.buildFAbs(Ty, Src);
1554 
1555   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1556   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1557   return true;
1558 }
1559 
1560 bool AMDGPULegalizerInfo::legalizeFceil(
1561   MachineInstr &MI, MachineRegisterInfo &MRI,
1562   MachineIRBuilder &B) const {
1563   B.setInstr(MI);
1564 
1565   const LLT S1 = LLT::scalar(1);
1566   const LLT S64 = LLT::scalar(64);
1567 
1568   Register Src = MI.getOperand(1).getReg();
1569   assert(MRI.getType(Src) == S64);
1570 
1571   // result = trunc(src)
1572   // if (src > 0.0 && src != result)
1573   //   result += 1.0
1574 
1575   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1576 
1577   const auto Zero = B.buildFConstant(S64, 0.0);
1578   const auto One = B.buildFConstant(S64, 1.0);
1579   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1580   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1581   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1582   auto Add = B.buildSelect(S64, And, One, Zero);
1583 
1584   // TODO: Should this propagate fast-math-flags?
1585   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1586   return true;
1587 }
1588 
1589 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1590                                               MachineIRBuilder &B) {
1591   const unsigned FractBits = 52;
1592   const unsigned ExpBits = 11;
1593   LLT S32 = LLT::scalar(32);
1594 
1595   auto Const0 = B.buildConstant(S32, FractBits - 32);
1596   auto Const1 = B.buildConstant(S32, ExpBits);
1597 
1598   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1599     .addUse(Const0.getReg(0))
1600     .addUse(Const1.getReg(0));
1601 
1602   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1603 }
1604 
1605 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1606   MachineInstr &MI, MachineRegisterInfo &MRI,
1607   MachineIRBuilder &B) const {
1608   B.setInstr(MI);
1609 
1610   const LLT S1 = LLT::scalar(1);
1611   const LLT S32 = LLT::scalar(32);
1612   const LLT S64 = LLT::scalar(64);
1613 
1614   Register Src = MI.getOperand(1).getReg();
1615   assert(MRI.getType(Src) == S64);
1616 
1617   // TODO: Should this use extract since the low half is unused?
1618   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1619   Register Hi = Unmerge.getReg(1);
1620 
1621   // Extract the upper half, since this is where we will find the sign and
1622   // exponent.
1623   auto Exp = extractF64Exponent(Hi, B);
1624 
1625   const unsigned FractBits = 52;
1626 
1627   // Extract the sign bit.
1628   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1629   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1630 
1631   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1632 
1633   const auto Zero32 = B.buildConstant(S32, 0);
1634 
1635   // Extend back to 64-bits.
1636   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1637 
1638   auto Shr = B.buildAShr(S64, FractMask, Exp);
1639   auto Not = B.buildNot(S64, Shr);
1640   auto Tmp0 = B.buildAnd(S64, Src, Not);
1641   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1642 
1643   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1644   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1645 
1646   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1647   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1648   return true;
1649 }
1650 
1651 bool AMDGPULegalizerInfo::legalizeITOFP(
1652   MachineInstr &MI, MachineRegisterInfo &MRI,
1653   MachineIRBuilder &B, bool Signed) const {
1654   B.setInstr(MI);
1655 
1656   Register Dst = MI.getOperand(0).getReg();
1657   Register Src = MI.getOperand(1).getReg();
1658 
1659   const LLT S64 = LLT::scalar(64);
1660   const LLT S32 = LLT::scalar(32);
1661 
1662   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1663 
1664   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1665 
1666   auto CvtHi = Signed ?
1667     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1668     B.buildUITOFP(S64, Unmerge.getReg(1));
1669 
1670   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1671 
1672   auto ThirtyTwo = B.buildConstant(S32, 32);
1673   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1674     .addUse(CvtHi.getReg(0))
1675     .addUse(ThirtyTwo.getReg(0));
1676 
1677   // TODO: Should this propagate fast-math-flags?
1678   B.buildFAdd(Dst, LdExp, CvtLo);
1679   MI.eraseFromParent();
1680   return true;
1681 }
1682 
1683 // TODO: Copied from DAG implementation. Verify logic and document how this
1684 // actually works.
1685 bool AMDGPULegalizerInfo::legalizeFPTOI(
1686   MachineInstr &MI, MachineRegisterInfo &MRI,
1687   MachineIRBuilder &B, bool Signed) const {
1688   B.setInstr(MI);
1689 
1690   Register Dst = MI.getOperand(0).getReg();
1691   Register Src = MI.getOperand(1).getReg();
1692 
1693   const LLT S64 = LLT::scalar(64);
1694   const LLT S32 = LLT::scalar(32);
1695 
1696   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1697 
1698   unsigned Flags = MI.getFlags();
1699 
1700   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1701   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1702   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1703 
1704   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1705   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1706   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1707 
1708   auto Hi = Signed ?
1709     B.buildFPTOSI(S32, FloorMul) :
1710     B.buildFPTOUI(S32, FloorMul);
1711   auto Lo = B.buildFPTOUI(S32, Fma);
1712 
1713   B.buildMerge(Dst, { Lo, Hi });
1714   MI.eraseFromParent();
1715 
1716   return true;
1717 }
1718 
1719 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1720   MachineInstr &MI, MachineRegisterInfo &MRI,
1721   MachineIRBuilder &B) const {
1722   MachineFunction &MF = B.getMF();
1723   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1724 
1725   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1726                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1727 
1728   // With ieee_mode disabled, the instructions have the correct behavior
1729   // already for G_FMINNUM/G_FMAXNUM
1730   if (!MFI->getMode().IEEE)
1731     return !IsIEEEOp;
1732 
1733   if (IsIEEEOp)
1734     return true;
1735 
1736   MachineIRBuilder HelperBuilder(MI);
1737   GISelObserverWrapper DummyObserver;
1738   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1739   HelperBuilder.setInstr(MI);
1740   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1741 }
1742 
1743 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1744   MachineInstr &MI, MachineRegisterInfo &MRI,
1745   MachineIRBuilder &B) const {
1746   // TODO: Should move some of this into LegalizerHelper.
1747 
1748   // TODO: Promote dynamic indexing of s16 to s32
1749 
1750   // FIXME: Artifact combiner probably should have replaced the truncated
1751   // constant before this, so we shouldn't need
1752   // getConstantVRegValWithLookThrough.
1753   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1754     MI.getOperand(2).getReg(), MRI);
1755   if (!IdxVal) // Dynamic case will be selected to register indexing.
1756     return true;
1757 
1758   Register Dst = MI.getOperand(0).getReg();
1759   Register Vec = MI.getOperand(1).getReg();
1760 
1761   LLT VecTy = MRI.getType(Vec);
1762   LLT EltTy = VecTy.getElementType();
1763   assert(EltTy == MRI.getType(Dst));
1764 
1765   B.setInstr(MI);
1766 
1767   if (IdxVal->Value < VecTy.getNumElements())
1768     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1769   else
1770     B.buildUndef(Dst);
1771 
1772   MI.eraseFromParent();
1773   return true;
1774 }
1775 
1776 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1777   MachineInstr &MI, MachineRegisterInfo &MRI,
1778   MachineIRBuilder &B) const {
1779   // TODO: Should move some of this into LegalizerHelper.
1780 
1781   // TODO: Promote dynamic indexing of s16 to s32
1782 
1783   // FIXME: Artifact combiner probably should have replaced the truncated
1784   // constant before this, so we shouldn't need
1785   // getConstantVRegValWithLookThrough.
1786   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1787     MI.getOperand(3).getReg(), MRI);
1788   if (!IdxVal) // Dynamic case will be selected to register indexing.
1789     return true;
1790 
1791   Register Dst = MI.getOperand(0).getReg();
1792   Register Vec = MI.getOperand(1).getReg();
1793   Register Ins = MI.getOperand(2).getReg();
1794 
1795   LLT VecTy = MRI.getType(Vec);
1796   LLT EltTy = VecTy.getElementType();
1797   assert(EltTy == MRI.getType(Ins));
1798 
1799   B.setInstr(MI);
1800 
1801   if (IdxVal->Value < VecTy.getNumElements())
1802     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1803   else
1804     B.buildUndef(Dst);
1805 
1806   MI.eraseFromParent();
1807   return true;
1808 }
1809 
1810 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1811   assert(Mask.size() == 2);
1812 
1813   // If one half is undef, the other is trivially in the same reg.
1814   if (Mask[0] == -1 || Mask[1] == -1)
1815     return true;
1816   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1817          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1818 }
1819 
1820 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1821   MachineInstr &MI, MachineRegisterInfo &MRI,
1822   MachineIRBuilder &B) const {
1823   const LLT V2S16 = LLT::vector(2, 16);
1824 
1825   Register Dst = MI.getOperand(0).getReg();
1826   Register Src0 = MI.getOperand(1).getReg();
1827   LLT DstTy = MRI.getType(Dst);
1828   LLT SrcTy = MRI.getType(Src0);
1829 
1830   if (SrcTy == V2S16 && DstTy == V2S16 &&
1831       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1832     return true;
1833 
1834   MachineIRBuilder HelperBuilder(MI);
1835   GISelObserverWrapper DummyObserver;
1836   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1837   HelperBuilder.setInstr(MI);
1838   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1839 }
1840 
1841 bool AMDGPULegalizerInfo::legalizeSinCos(
1842   MachineInstr &MI, MachineRegisterInfo &MRI,
1843   MachineIRBuilder &B) const {
1844   B.setInstr(MI);
1845 
1846   Register DstReg = MI.getOperand(0).getReg();
1847   Register SrcReg = MI.getOperand(1).getReg();
1848   LLT Ty = MRI.getType(DstReg);
1849   unsigned Flags = MI.getFlags();
1850 
1851   Register TrigVal;
1852   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1853   if (ST.hasTrigReducedRange()) {
1854     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1855     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1856       .addUse(MulVal.getReg(0))
1857       .setMIFlags(Flags).getReg(0);
1858   } else
1859     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1860 
1861   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1862     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1863   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1864     .addUse(TrigVal)
1865     .setMIFlags(Flags);
1866   MI.eraseFromParent();
1867   return true;
1868 }
1869 
1870 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1871   Register DstReg, LLT PtrTy,
1872   MachineIRBuilder &B, const GlobalValue *GV,
1873   unsigned Offset, unsigned GAFlags) const {
1874   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1875   // to the following code sequence:
1876   //
1877   // For constant address space:
1878   //   s_getpc_b64 s[0:1]
1879   //   s_add_u32 s0, s0, $symbol
1880   //   s_addc_u32 s1, s1, 0
1881   //
1882   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1883   //   a fixup or relocation is emitted to replace $symbol with a literal
1884   //   constant, which is a pc-relative offset from the encoding of the $symbol
1885   //   operand to the global variable.
1886   //
1887   // For global address space:
1888   //   s_getpc_b64 s[0:1]
1889   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1890   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1891   //
1892   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1893   //   fixups or relocations are emitted to replace $symbol@*@lo and
1894   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1895   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1896   //   operand to the global variable.
1897   //
1898   // What we want here is an offset from the value returned by s_getpc
1899   // (which is the address of the s_add_u32 instruction) to the global
1900   // variable, but since the encoding of $symbol starts 4 bytes after the start
1901   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1902   // small. This requires us to add 4 to the global variable offset in order to
1903   // compute the correct address.
1904 
1905   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1906 
1907   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1908     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1909 
1910   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1911     .addDef(PCReg);
1912 
1913   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1914   if (GAFlags == SIInstrInfo::MO_NONE)
1915     MIB.addImm(0);
1916   else
1917     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1918 
1919   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1920 
1921   if (PtrTy.getSizeInBits() == 32)
1922     B.buildExtract(DstReg, PCReg, 0);
1923   return true;
1924  }
1925 
1926 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1927   MachineInstr &MI, MachineRegisterInfo &MRI,
1928   MachineIRBuilder &B) const {
1929   Register DstReg = MI.getOperand(0).getReg();
1930   LLT Ty = MRI.getType(DstReg);
1931   unsigned AS = Ty.getAddressSpace();
1932 
1933   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1934   MachineFunction &MF = B.getMF();
1935   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1936   B.setInstr(MI);
1937 
1938   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1939     if (!MFI->isEntryFunction()) {
1940       const Function &Fn = MF.getFunction();
1941       DiagnosticInfoUnsupported BadLDSDecl(
1942         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1943       Fn.getContext().diagnose(BadLDSDecl);
1944     }
1945 
1946     // TODO: We could emit code to handle the initialization somewhere.
1947     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1948       const SITargetLowering *TLI = ST.getTargetLowering();
1949       if (!TLI->shouldUseLDSConstAddress(GV)) {
1950         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1951         return true; // Leave in place;
1952       }
1953 
1954       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1955       MI.eraseFromParent();
1956       return true;
1957     }
1958 
1959     const Function &Fn = MF.getFunction();
1960     DiagnosticInfoUnsupported BadInit(
1961       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1962     Fn.getContext().diagnose(BadInit);
1963     return true;
1964   }
1965 
1966   const SITargetLowering *TLI = ST.getTargetLowering();
1967 
1968   if (TLI->shouldEmitFixup(GV)) {
1969     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1970     MI.eraseFromParent();
1971     return true;
1972   }
1973 
1974   if (TLI->shouldEmitPCReloc(GV)) {
1975     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1976     MI.eraseFromParent();
1977     return true;
1978   }
1979 
1980   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1981   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1982 
1983   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1984     MachinePointerInfo::getGOT(MF),
1985     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1986     MachineMemOperand::MOInvariant,
1987     8 /*Size*/, 8 /*Align*/);
1988 
1989   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1990 
1991   if (Ty.getSizeInBits() == 32) {
1992     // Truncate if this is a 32-bit constant adrdess.
1993     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1994     B.buildExtract(DstReg, Load, 0);
1995   } else
1996     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1997 
1998   MI.eraseFromParent();
1999   return true;
2000 }
2001 
2002 bool AMDGPULegalizerInfo::legalizeLoad(
2003   MachineInstr &MI, MachineRegisterInfo &MRI,
2004   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2005   B.setInstr(MI);
2006   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2007   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2008   Observer.changingInstr(MI);
2009   MI.getOperand(1).setReg(Cast.getReg(0));
2010   Observer.changedInstr(MI);
2011   return true;
2012 }
2013 
2014 bool AMDGPULegalizerInfo::legalizeFMad(
2015   MachineInstr &MI, MachineRegisterInfo &MRI,
2016   MachineIRBuilder &B) const {
2017   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2018   assert(Ty.isScalar());
2019 
2020   MachineFunction &MF = B.getMF();
2021   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2022 
2023   // TODO: Always legal with future ftz flag.
2024   // FIXME: Do we need just output?
2025   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2026     return true;
2027   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2028     return true;
2029 
2030   MachineIRBuilder HelperBuilder(MI);
2031   GISelObserverWrapper DummyObserver;
2032   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2033   HelperBuilder.setMBB(*MI.getParent());
2034   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2035 }
2036 
2037 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2038   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2039   Register DstReg = MI.getOperand(0).getReg();
2040   Register PtrReg = MI.getOperand(1).getReg();
2041   Register CmpVal = MI.getOperand(2).getReg();
2042   Register NewVal = MI.getOperand(3).getReg();
2043 
2044   assert(SITargetLowering::isFlatGlobalAddrSpace(
2045            MRI.getType(PtrReg).getAddressSpace()) &&
2046          "this should not have been custom lowered");
2047 
2048   LLT ValTy = MRI.getType(CmpVal);
2049   LLT VecTy = LLT::vector(2, ValTy);
2050 
2051   B.setInstr(MI);
2052   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2053 
2054   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2055     .addDef(DstReg)
2056     .addUse(PtrReg)
2057     .addUse(PackedVal)
2058     .setMemRefs(MI.memoperands());
2059 
2060   MI.eraseFromParent();
2061   return true;
2062 }
2063 
2064 bool AMDGPULegalizerInfo::legalizeFlog(
2065   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2066   Register Dst = MI.getOperand(0).getReg();
2067   Register Src = MI.getOperand(1).getReg();
2068   LLT Ty = B.getMRI()->getType(Dst);
2069   unsigned Flags = MI.getFlags();
2070   B.setInstr(MI);
2071 
2072   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2073   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2074 
2075   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2076   MI.eraseFromParent();
2077   return true;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2081                                        MachineIRBuilder &B) const {
2082   Register Dst = MI.getOperand(0).getReg();
2083   Register Src = MI.getOperand(1).getReg();
2084   unsigned Flags = MI.getFlags();
2085   LLT Ty = B.getMRI()->getType(Dst);
2086   B.setInstr(MI);
2087 
2088   auto K = B.buildFConstant(Ty, numbers::log2e);
2089   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2090   B.buildFExp2(Dst, Mul, Flags);
2091   MI.eraseFromParent();
2092   return true;
2093 }
2094 
2095 // Find a source register, ignoring any possible source modifiers.
2096 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2097   Register ModSrc = OrigSrc;
2098   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2099     ModSrc = SrcFNeg->getOperand(1).getReg();
2100     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2101       ModSrc = SrcFAbs->getOperand(1).getReg();
2102   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2103     ModSrc = SrcFAbs->getOperand(1).getReg();
2104   return ModSrc;
2105 }
2106 
2107 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2108                                          MachineRegisterInfo &MRI,
2109                                          MachineIRBuilder &B) const {
2110   B.setInstr(MI);
2111 
2112   const LLT S1 = LLT::scalar(1);
2113   const LLT S64 = LLT::scalar(64);
2114   Register Dst = MI.getOperand(0).getReg();
2115   Register OrigSrc = MI.getOperand(1).getReg();
2116   unsigned Flags = MI.getFlags();
2117   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2118          "this should not have been custom lowered");
2119 
2120   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2121   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2122   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2123   // V_FRACT bug is:
2124   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2125   //
2126   // Convert floor(x) to (x - fract(x))
2127 
2128   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2129     .addUse(OrigSrc)
2130     .setMIFlags(Flags);
2131 
2132   // Give source modifier matching some assistance before obscuring a foldable
2133   // pattern.
2134 
2135   // TODO: We can avoid the neg on the fract? The input sign to fract
2136   // shouldn't matter?
2137   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2138 
2139   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2140 
2141   Register Min = MRI.createGenericVirtualRegister(S64);
2142 
2143   // We don't need to concern ourselves with the snan handling difference, so
2144   // use the one which will directly select.
2145   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2146   if (MFI->getMode().IEEE)
2147     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2148   else
2149     B.buildFMinNum(Min, Fract, Const, Flags);
2150 
2151   Register CorrectedFract = Min;
2152   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2153     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2154     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2155   }
2156 
2157   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2158   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2159 
2160   MI.eraseFromParent();
2161   return true;
2162 }
2163 
2164 // Turn an illegal packed v2s16 build vector into bit operations.
2165 // TODO: This should probably be a bitcast action in LegalizerHelper.
2166 bool AMDGPULegalizerInfo::legalizeBuildVector(
2167   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2168   Register Dst = MI.getOperand(0).getReg();
2169   LLT DstTy = MRI.getType(Dst);
2170   const LLT S32 = LLT::scalar(32);
2171   const LLT V2S16 = LLT::vector(2, 16);
2172   (void)DstTy;
2173   (void)V2S16;
2174   assert(DstTy == V2S16);
2175 
2176   Register Src0 = MI.getOperand(1).getReg();
2177   Register Src1 = MI.getOperand(2).getReg();
2178   assert(MRI.getType(Src0) == LLT::scalar(16));
2179 
2180   B.setInstr(MI);
2181   auto Merge = B.buildMerge(S32, {Src0, Src1});
2182   B.buildBitcast(Dst, Merge);
2183 
2184   MI.eraseFromParent();
2185   return true;
2186 }
2187 
2188 // Return the use branch instruction, otherwise null if the usage is invalid.
2189 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2190                                        MachineRegisterInfo &MRI,
2191                                        MachineInstr *&Br) {
2192   Register CondDef = MI.getOperand(0).getReg();
2193   if (!MRI.hasOneNonDBGUse(CondDef))
2194     return nullptr;
2195 
2196   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2197   if (UseMI.getParent() != MI.getParent() ||
2198       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2199     return nullptr;
2200 
2201   // Make sure the cond br is followed by a G_BR
2202   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2203   if (Next != MI.getParent()->end()) {
2204     if (Next->getOpcode() != AMDGPU::G_BR)
2205       return nullptr;
2206     Br = &*Next;
2207   }
2208 
2209   return &UseMI;
2210 }
2211 
2212 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2213                                                 Register Reg, LLT Ty) const {
2214   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2215   if (LiveIn)
2216     return LiveIn;
2217 
2218   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2219   MRI.addLiveIn(Reg, NewReg);
2220   return NewReg;
2221 }
2222 
2223 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2224                                          const ArgDescriptor *Arg) const {
2225   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2226     return false; // TODO: Handle these
2227 
2228   assert(Arg->getRegister().isPhysical());
2229 
2230   MachineRegisterInfo &MRI = *B.getMRI();
2231 
2232   LLT Ty = MRI.getType(DstReg);
2233   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2234 
2235   if (Arg->isMasked()) {
2236     // TODO: Should we try to emit this once in the entry block?
2237     const LLT S32 = LLT::scalar(32);
2238     const unsigned Mask = Arg->getMask();
2239     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2240 
2241     Register AndMaskSrc = LiveIn;
2242 
2243     if (Shift != 0) {
2244       auto ShiftAmt = B.buildConstant(S32, Shift);
2245       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2246     }
2247 
2248     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2249   } else
2250     B.buildCopy(DstReg, LiveIn);
2251 
2252   // Insert the argument copy if it doens't already exist.
2253   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2254   if (!MRI.getVRegDef(LiveIn)) {
2255     // FIXME: Should have scoped insert pt
2256     MachineBasicBlock &OrigInsBB = B.getMBB();
2257     auto OrigInsPt = B.getInsertPt();
2258 
2259     MachineBasicBlock &EntryMBB = B.getMF().front();
2260     EntryMBB.addLiveIn(Arg->getRegister());
2261     B.setInsertPt(EntryMBB, EntryMBB.begin());
2262     B.buildCopy(LiveIn, Arg->getRegister());
2263 
2264     B.setInsertPt(OrigInsBB, OrigInsPt);
2265   }
2266 
2267   return true;
2268 }
2269 
2270 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2271   MachineInstr &MI,
2272   MachineRegisterInfo &MRI,
2273   MachineIRBuilder &B,
2274   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2275   B.setInstr(MI);
2276 
2277   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2278 
2279   const ArgDescriptor *Arg;
2280   const TargetRegisterClass *RC;
2281   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2282   if (!Arg) {
2283     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2284     return false;
2285   }
2286 
2287   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2288     MI.eraseFromParent();
2289     return true;
2290   }
2291 
2292   return false;
2293 }
2294 
2295 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2296                                        MachineRegisterInfo &MRI,
2297                                        MachineIRBuilder &B) const {
2298   B.setInstr(MI);
2299   Register Dst = MI.getOperand(0).getReg();
2300   LLT DstTy = MRI.getType(Dst);
2301   LLT S16 = LLT::scalar(16);
2302   LLT S32 = LLT::scalar(32);
2303   LLT S64 = LLT::scalar(64);
2304 
2305   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2306     return true;
2307 
2308   if (DstTy == S16)
2309     return legalizeFDIV16(MI, MRI, B);
2310   if (DstTy == S32)
2311     return legalizeFDIV32(MI, MRI, B);
2312   if (DstTy == S64)
2313     return legalizeFDIV64(MI, MRI, B);
2314 
2315   return false;
2316 }
2317 
2318 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2319                                                  MachineRegisterInfo &MRI,
2320                                                  MachineIRBuilder &B) const {
2321   Register Res = MI.getOperand(0).getReg();
2322   Register LHS = MI.getOperand(1).getReg();
2323   Register RHS = MI.getOperand(2).getReg();
2324 
2325   uint16_t Flags = MI.getFlags();
2326 
2327   LLT ResTy = MRI.getType(Res);
2328   LLT S32 = LLT::scalar(32);
2329   LLT S64 = LLT::scalar(64);
2330 
2331   const MachineFunction &MF = B.getMF();
2332   bool Unsafe =
2333     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2334 
2335   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2336     return false;
2337 
2338   if (!Unsafe && ResTy == S32 &&
2339       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2340     return false;
2341 
2342   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2343     // 1 / x -> RCP(x)
2344     if (CLHS->isExactlyValue(1.0)) {
2345       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2346         .addUse(RHS)
2347         .setMIFlags(Flags);
2348 
2349       MI.eraseFromParent();
2350       return true;
2351     }
2352 
2353     // -1 / x -> RCP( FNEG(x) )
2354     if (CLHS->isExactlyValue(-1.0)) {
2355       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2356       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2357         .addUse(FNeg.getReg(0))
2358         .setMIFlags(Flags);
2359 
2360       MI.eraseFromParent();
2361       return true;
2362     }
2363   }
2364 
2365   // x / y -> x * (1.0 / y)
2366   if (Unsafe) {
2367     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2368       .addUse(RHS)
2369       .setMIFlags(Flags);
2370     B.buildFMul(Res, LHS, RCP, Flags);
2371 
2372     MI.eraseFromParent();
2373     return true;
2374   }
2375 
2376   return false;
2377 }
2378 
2379 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2380                                          MachineRegisterInfo &MRI,
2381                                          MachineIRBuilder &B) const {
2382   B.setInstr(MI);
2383   Register Res = MI.getOperand(0).getReg();
2384   Register LHS = MI.getOperand(1).getReg();
2385   Register RHS = MI.getOperand(2).getReg();
2386 
2387   uint16_t Flags = MI.getFlags();
2388 
2389   LLT S16 = LLT::scalar(16);
2390   LLT S32 = LLT::scalar(32);
2391 
2392   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2393   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2394 
2395   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2396     .addUse(RHSExt.getReg(0))
2397     .setMIFlags(Flags);
2398 
2399   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2400   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2401 
2402   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2403     .addUse(RDst.getReg(0))
2404     .addUse(RHS)
2405     .addUse(LHS)
2406     .setMIFlags(Flags);
2407 
2408   MI.eraseFromParent();
2409   return true;
2410 }
2411 
2412 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2413 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2414 static void toggleSPDenormMode(bool Enable,
2415                                MachineIRBuilder &B,
2416                                const GCNSubtarget &ST,
2417                                AMDGPU::SIModeRegisterDefaults Mode) {
2418   // Set SP denorm mode to this value.
2419   unsigned SPDenormMode =
2420     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2421 
2422   if (ST.hasDenormModeInst()) {
2423     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2424     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2425 
2426     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2427     B.buildInstr(AMDGPU::S_DENORM_MODE)
2428       .addImm(NewDenormModeValue);
2429 
2430   } else {
2431     // Select FP32 bit field in mode register.
2432     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2433                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2434                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2435 
2436     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2437       .addImm(SPDenormMode)
2438       .addImm(SPDenormModeBitField);
2439   }
2440 }
2441 
2442 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2443                                          MachineRegisterInfo &MRI,
2444                                          MachineIRBuilder &B) const {
2445   B.setInstr(MI);
2446   Register Res = MI.getOperand(0).getReg();
2447   Register LHS = MI.getOperand(1).getReg();
2448   Register RHS = MI.getOperand(2).getReg();
2449   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2450   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2451 
2452   uint16_t Flags = MI.getFlags();
2453 
2454   LLT S32 = LLT::scalar(32);
2455   LLT S1 = LLT::scalar(1);
2456 
2457   auto One = B.buildFConstant(S32, 1.0f);
2458 
2459   auto DenominatorScaled =
2460     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2461       .addUse(RHS)
2462       .addUse(LHS)
2463       .addImm(1)
2464       .setMIFlags(Flags);
2465   auto NumeratorScaled =
2466     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2467       .addUse(LHS)
2468       .addUse(RHS)
2469       .addImm(0)
2470       .setMIFlags(Flags);
2471 
2472   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2473     .addUse(DenominatorScaled.getReg(0))
2474     .setMIFlags(Flags);
2475   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2476 
2477   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2478   // aren't modeled as reading it.
2479   if (!Mode.allFP32Denormals())
2480     toggleSPDenormMode(true, B, ST, Mode);
2481 
2482   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2483   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2484   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2485   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2486   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2487   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2488 
2489   if (!Mode.allFP32Denormals())
2490     toggleSPDenormMode(false, B, ST, Mode);
2491 
2492   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2493     .addUse(Fma4.getReg(0))
2494     .addUse(Fma1.getReg(0))
2495     .addUse(Fma3.getReg(0))
2496     .addUse(NumeratorScaled.getReg(1))
2497     .setMIFlags(Flags);
2498 
2499   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2500     .addUse(Fmas.getReg(0))
2501     .addUse(RHS)
2502     .addUse(LHS)
2503     .setMIFlags(Flags);
2504 
2505   MI.eraseFromParent();
2506   return true;
2507 }
2508 
2509 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2510                                          MachineRegisterInfo &MRI,
2511                                          MachineIRBuilder &B) const {
2512   B.setInstr(MI);
2513   Register Res = MI.getOperand(0).getReg();
2514   Register LHS = MI.getOperand(1).getReg();
2515   Register RHS = MI.getOperand(2).getReg();
2516 
2517   uint16_t Flags = MI.getFlags();
2518 
2519   LLT S64 = LLT::scalar(64);
2520   LLT S1 = LLT::scalar(1);
2521 
2522   auto One = B.buildFConstant(S64, 1.0);
2523 
2524   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2525     .addUse(LHS)
2526     .addUse(RHS)
2527     .addImm(1)
2528     .setMIFlags(Flags);
2529 
2530   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2531 
2532   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2533     .addUse(DivScale0.getReg(0))
2534     .setMIFlags(Flags);
2535 
2536   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2537   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2538   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2539 
2540   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2541     .addUse(LHS)
2542     .addUse(RHS)
2543     .addImm(0)
2544     .setMIFlags(Flags);
2545 
2546   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2547   auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2548   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2549 
2550   Register Scale;
2551   if (!ST.hasUsableDivScaleConditionOutput()) {
2552     // Workaround a hardware bug on SI where the condition output from div_scale
2553     // is not usable.
2554 
2555     LLT S32 = LLT::scalar(32);
2556 
2557     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2558     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2559     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2560     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2561 
2562     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2563                               Scale1Unmerge.getReg(1));
2564     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2565                               Scale0Unmerge.getReg(1));
2566     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2567   } else {
2568     Scale = DivScale1.getReg(1);
2569   }
2570 
2571   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2572     .addUse(Fma4.getReg(0))
2573     .addUse(Fma3.getReg(0))
2574     .addUse(Mul.getReg(0))
2575     .addUse(Scale)
2576     .setMIFlags(Flags);
2577 
2578   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2579     .addUse(Fmas.getReg(0))
2580     .addUse(RHS)
2581     .addUse(LHS)
2582     .setMIFlags(Flags);
2583 
2584   MI.eraseFromParent();
2585   return true;
2586 }
2587 
2588 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2589                                                  MachineRegisterInfo &MRI,
2590                                                  MachineIRBuilder &B) const {
2591   B.setInstr(MI);
2592   Register Res = MI.getOperand(0).getReg();
2593   Register LHS = MI.getOperand(2).getReg();
2594   Register RHS = MI.getOperand(3).getReg();
2595   uint16_t Flags = MI.getFlags();
2596 
2597   LLT S32 = LLT::scalar(32);
2598   LLT S1 = LLT::scalar(1);
2599 
2600   auto Abs = B.buildFAbs(S32, RHS, Flags);
2601   const APFloat C0Val(1.0f);
2602 
2603   auto C0 = B.buildConstant(S32, 0x6f800000);
2604   auto C1 = B.buildConstant(S32, 0x2f800000);
2605   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2606 
2607   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2608   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2609 
2610   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2611 
2612   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2613     .addUse(Mul0.getReg(0))
2614     .setMIFlags(Flags);
2615 
2616   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2617 
2618   B.buildFMul(Res, Sel, Mul1, Flags);
2619 
2620   MI.eraseFromParent();
2621   return true;
2622 }
2623 
2624 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2625                                                  MachineRegisterInfo &MRI,
2626                                                  MachineIRBuilder &B) const {
2627   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2628   if (!MFI->isEntryFunction()) {
2629     return legalizePreloadedArgIntrin(MI, MRI, B,
2630                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2631   }
2632 
2633   B.setInstr(MI);
2634 
2635   uint64_t Offset =
2636     ST.getTargetLowering()->getImplicitParameterOffset(
2637       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2638   Register DstReg = MI.getOperand(0).getReg();
2639   LLT DstTy = MRI.getType(DstReg);
2640   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2641 
2642   const ArgDescriptor *Arg;
2643   const TargetRegisterClass *RC;
2644   std::tie(Arg, RC)
2645     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2646   if (!Arg)
2647     return false;
2648 
2649   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2650   if (!loadInputValue(KernargPtrReg, B, Arg))
2651     return false;
2652 
2653   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2654   MI.eraseFromParent();
2655   return true;
2656 }
2657 
2658 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2659                                               MachineRegisterInfo &MRI,
2660                                               MachineIRBuilder &B,
2661                                               unsigned AddrSpace) const {
2662   B.setInstr(MI);
2663   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2664   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2665   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2666   MI.eraseFromParent();
2667   return true;
2668 }
2669 
2670 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2671 // offset (the offset that is included in bounds checking and swizzling, to be
2672 // split between the instruction's voffset and immoffset fields) and soffset
2673 // (the offset that is excluded from bounds checking and swizzling, to go in
2674 // the instruction's soffset field).  This function takes the first kind of
2675 // offset and figures out how to split it between voffset and immoffset.
2676 std::tuple<Register, unsigned, unsigned>
2677 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2678                                         Register OrigOffset) const {
2679   const unsigned MaxImm = 4095;
2680   Register BaseReg;
2681   unsigned TotalConstOffset;
2682   MachineInstr *OffsetDef;
2683   const LLT S32 = LLT::scalar(32);
2684 
2685   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2686     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2687 
2688   unsigned ImmOffset = TotalConstOffset;
2689 
2690   // If the immediate value is too big for the immoffset field, put the value
2691   // and -4096 into the immoffset field so that the value that is copied/added
2692   // for the voffset field is a multiple of 4096, and it stands more chance
2693   // of being CSEd with the copy/add for another similar load/store.
2694   // However, do not do that rounding down to a multiple of 4096 if that is a
2695   // negative number, as it appears to be illegal to have a negative offset
2696   // in the vgpr, even if adding the immediate offset makes it positive.
2697   unsigned Overflow = ImmOffset & ~MaxImm;
2698   ImmOffset -= Overflow;
2699   if ((int32_t)Overflow < 0) {
2700     Overflow += ImmOffset;
2701     ImmOffset = 0;
2702   }
2703 
2704   if (Overflow != 0) {
2705     if (!BaseReg) {
2706       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2707     } else {
2708       auto OverflowVal = B.buildConstant(S32, Overflow);
2709       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2710     }
2711   }
2712 
2713   if (!BaseReg)
2714     BaseReg = B.buildConstant(S32, 0).getReg(0);
2715 
2716   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2717 }
2718 
2719 /// Handle register layout difference for f16 images for some subtargets.
2720 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2721                                              MachineRegisterInfo &MRI,
2722                                              Register Reg) const {
2723   if (!ST.hasUnpackedD16VMem())
2724     return Reg;
2725 
2726   const LLT S16 = LLT::scalar(16);
2727   const LLT S32 = LLT::scalar(32);
2728   LLT StoreVT = MRI.getType(Reg);
2729   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2730 
2731   auto Unmerge = B.buildUnmerge(S16, Reg);
2732 
2733   SmallVector<Register, 4> WideRegs;
2734   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2735     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2736 
2737   int NumElts = StoreVT.getNumElements();
2738 
2739   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2740 }
2741 
2742 Register AMDGPULegalizerInfo::fixStoreSourceType(
2743   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2744   MachineRegisterInfo *MRI = B.getMRI();
2745   LLT Ty = MRI->getType(VData);
2746 
2747   const LLT S16 = LLT::scalar(16);
2748 
2749   // Fixup illegal register types for i8 stores.
2750   if (Ty == LLT::scalar(8) || Ty == S16) {
2751     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2752     return AnyExt;
2753   }
2754 
2755   if (Ty.isVector()) {
2756     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2757       if (IsFormat)
2758         return handleD16VData(B, *MRI, VData);
2759     }
2760   }
2761 
2762   return VData;
2763 }
2764 
2765 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2766                                               MachineRegisterInfo &MRI,
2767                                               MachineIRBuilder &B,
2768                                               bool IsTyped,
2769                                               bool IsFormat) const {
2770   B.setInstr(MI);
2771 
2772   Register VData = MI.getOperand(1).getReg();
2773   LLT Ty = MRI.getType(VData);
2774   LLT EltTy = Ty.getScalarType();
2775   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2776   const LLT S32 = LLT::scalar(32);
2777 
2778   VData = fixStoreSourceType(B, VData, IsFormat);
2779   Register RSrc = MI.getOperand(2).getReg();
2780 
2781   MachineMemOperand *MMO = *MI.memoperands_begin();
2782   const int MemSize = MMO->getSize();
2783 
2784   unsigned ImmOffset;
2785   unsigned TotalOffset;
2786 
2787   // The typed intrinsics add an immediate after the registers.
2788   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2789 
2790   // The struct intrinsic variants add one additional operand over raw.
2791   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2792   Register VIndex;
2793   int OpOffset = 0;
2794   if (HasVIndex) {
2795     VIndex = MI.getOperand(3).getReg();
2796     OpOffset = 1;
2797   }
2798 
2799   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2800   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2801 
2802   unsigned Format = 0;
2803   if (IsTyped) {
2804     Format = MI.getOperand(5 + OpOffset).getImm();
2805     ++OpOffset;
2806   }
2807 
2808   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2809 
2810   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2811   if (TotalOffset != 0)
2812     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2813 
2814   unsigned Opc;
2815   if (IsTyped) {
2816     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2817                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2818   } else if (IsFormat) {
2819     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2820                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2821   } else {
2822     switch (MemSize) {
2823     case 1:
2824       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2825       break;
2826     case 2:
2827       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2828       break;
2829     default:
2830       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2831       break;
2832     }
2833   }
2834 
2835   if (!VIndex)
2836     VIndex = B.buildConstant(S32, 0).getReg(0);
2837 
2838   auto MIB = B.buildInstr(Opc)
2839     .addUse(VData)              // vdata
2840     .addUse(RSrc)               // rsrc
2841     .addUse(VIndex)             // vindex
2842     .addUse(VOffset)            // voffset
2843     .addUse(SOffset)            // soffset
2844     .addImm(ImmOffset);         // offset(imm)
2845 
2846   if (IsTyped)
2847     MIB.addImm(Format);
2848 
2849   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2850      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2851      .addMemOperand(MMO);
2852 
2853   MI.eraseFromParent();
2854   return true;
2855 }
2856 
2857 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2858                                              MachineRegisterInfo &MRI,
2859                                              MachineIRBuilder &B,
2860                                              bool IsFormat,
2861                                              bool IsTyped) const {
2862   B.setInstr(MI);
2863 
2864   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2865   MachineMemOperand *MMO = *MI.memoperands_begin();
2866   const int MemSize = MMO->getSize();
2867   const LLT S32 = LLT::scalar(32);
2868 
2869   Register Dst = MI.getOperand(0).getReg();
2870   Register RSrc = MI.getOperand(2).getReg();
2871 
2872   // The typed intrinsics add an immediate after the registers.
2873   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2874 
2875   // The struct intrinsic variants add one additional operand over raw.
2876   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2877   Register VIndex;
2878   int OpOffset = 0;
2879   if (HasVIndex) {
2880     VIndex = MI.getOperand(3).getReg();
2881     OpOffset = 1;
2882   }
2883 
2884   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2885   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2886 
2887   unsigned Format = 0;
2888   if (IsTyped) {
2889     Format = MI.getOperand(5 + OpOffset).getImm();
2890     ++OpOffset;
2891   }
2892 
2893   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2894   unsigned ImmOffset;
2895   unsigned TotalOffset;
2896 
2897   LLT Ty = MRI.getType(Dst);
2898   LLT EltTy = Ty.getScalarType();
2899   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2900   const bool Unpacked = ST.hasUnpackedD16VMem();
2901 
2902   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2903   if (TotalOffset != 0)
2904     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2905 
2906   unsigned Opc;
2907 
2908   if (IsTyped) {
2909     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2910                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2911   } else if (IsFormat) {
2912     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2913                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2914   } else {
2915     switch (MemSize) {
2916     case 1:
2917       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2918       break;
2919     case 2:
2920       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2921       break;
2922     default:
2923       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2924       break;
2925     }
2926   }
2927 
2928   Register LoadDstReg;
2929 
2930   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2931   LLT UnpackedTy = Ty.changeElementSize(32);
2932 
2933   if (IsExtLoad)
2934     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2935   else if (Unpacked && IsD16 && Ty.isVector())
2936     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2937   else
2938     LoadDstReg = Dst;
2939 
2940   if (!VIndex)
2941     VIndex = B.buildConstant(S32, 0).getReg(0);
2942 
2943   auto MIB = B.buildInstr(Opc)
2944     .addDef(LoadDstReg)         // vdata
2945     .addUse(RSrc)               // rsrc
2946     .addUse(VIndex)             // vindex
2947     .addUse(VOffset)            // voffset
2948     .addUse(SOffset)            // soffset
2949     .addImm(ImmOffset);         // offset(imm)
2950 
2951   if (IsTyped)
2952     MIB.addImm(Format);
2953 
2954   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2955      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2956      .addMemOperand(MMO);
2957 
2958   if (LoadDstReg != Dst) {
2959     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2960 
2961     // Widen result for extending loads was widened.
2962     if (IsExtLoad)
2963       B.buildTrunc(Dst, LoadDstReg);
2964     else {
2965       // Repack to original 16-bit vector result
2966       // FIXME: G_TRUNC should work, but legalization currently fails
2967       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2968       SmallVector<Register, 4> Repack;
2969       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2970         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2971       B.buildMerge(Dst, Repack);
2972     }
2973   }
2974 
2975   MI.eraseFromParent();
2976   return true;
2977 }
2978 
2979 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2980                                                MachineIRBuilder &B,
2981                                                bool IsInc) const {
2982   B.setInstr(MI);
2983   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2984                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2985   B.buildInstr(Opc)
2986     .addDef(MI.getOperand(0).getReg())
2987     .addUse(MI.getOperand(2).getReg())
2988     .addUse(MI.getOperand(3).getReg())
2989     .cloneMemRefs(MI);
2990   MI.eraseFromParent();
2991   return true;
2992 }
2993 
2994 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2995   switch (IntrID) {
2996   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2997   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2998     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2999   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3000   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3001     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3002   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3003   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3004     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3005   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3006   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3007     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3008   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3009   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3010     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3011   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3012   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3013     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3014   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3015   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3016     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3017   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3018   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3019     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3020   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3021   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3022     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3023   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3024   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3025     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3026   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3027   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3028     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3029   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3030   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3031     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3032   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3033   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3034     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3035   default:
3036     llvm_unreachable("unhandled atomic opcode");
3037   }
3038 }
3039 
3040 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3041                                                MachineIRBuilder &B,
3042                                                Intrinsic::ID IID) const {
3043   B.setInstr(MI);
3044 
3045   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3046                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3047 
3048   Register Dst = MI.getOperand(0).getReg();
3049   Register VData = MI.getOperand(2).getReg();
3050 
3051   Register CmpVal;
3052   int OpOffset = 0;
3053 
3054   if (IsCmpSwap) {
3055     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3056     ++OpOffset;
3057   }
3058 
3059   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3060   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3061 
3062   // The struct intrinsic variants add one additional operand over raw.
3063   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3064   Register VIndex;
3065   if (HasVIndex) {
3066     VIndex = MI.getOperand(4 + OpOffset).getReg();
3067     ++OpOffset;
3068   }
3069 
3070   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3071   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3072   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3073 
3074   MachineMemOperand *MMO = *MI.memoperands_begin();
3075 
3076   unsigned ImmOffset;
3077   unsigned TotalOffset;
3078   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3079   if (TotalOffset != 0)
3080     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3081 
3082   if (!VIndex)
3083     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3084 
3085   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3086     .addDef(Dst)
3087     .addUse(VData); // vdata
3088 
3089   if (IsCmpSwap)
3090     MIB.addReg(CmpVal);
3091 
3092   MIB.addUse(RSrc)               // rsrc
3093      .addUse(VIndex)             // vindex
3094      .addUse(VOffset)            // voffset
3095      .addUse(SOffset)            // soffset
3096      .addImm(ImmOffset)          // offset(imm)
3097      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3098      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3099      .addMemOperand(MMO);
3100 
3101   MI.eraseFromParent();
3102   return true;
3103 }
3104 
3105 // Produce a vector of s16 elements from s32 pieces.
3106 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3107                              ArrayRef<Register> UnmergeParts) {
3108   const LLT S16 = LLT::scalar(16);
3109 
3110   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3111   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3112     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3113 
3114   B.buildBuildVector(DstReg, RemergeParts);
3115 }
3116 
3117 /// Convert a set of s32 registers to a result vector with s16 elements.
3118 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3119                                ArrayRef<Register> UnmergeParts) {
3120   MachineRegisterInfo &MRI = *B.getMRI();
3121   const LLT V2S16 = LLT::vector(2, 16);
3122   LLT TargetTy = MRI.getType(DstReg);
3123   int NumElts = UnmergeParts.size();
3124 
3125   if (NumElts == 1) {
3126     assert(TargetTy == V2S16);
3127     B.buildBitcast(DstReg, UnmergeParts[0]);
3128     return;
3129   }
3130 
3131   SmallVector<Register, 4> RemergeParts(NumElts);
3132   for (int I = 0; I != NumElts; ++I)
3133     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3134 
3135   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3136     B.buildConcatVectors(DstReg, RemergeParts);
3137     return;
3138   }
3139 
3140   const LLT V3S16 = LLT::vector(3, 16);
3141   const LLT V6S16 = LLT::vector(6, 16);
3142 
3143   // Widen to v6s16 and unpack v3 parts.
3144   assert(TargetTy == V3S16);
3145 
3146   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3147   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3148   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3149 }
3150 
3151 // FIXME: Just vector trunc should be sufficent, but legalization currently
3152 // broken.
3153 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3154                                   Register WideDstReg) {
3155   const LLT S32 = LLT::scalar(32);
3156   const LLT S16 = LLT::scalar(16);
3157 
3158   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3159 
3160   int NumOps = Unmerge->getNumOperands() - 1;
3161   SmallVector<Register, 4> RemergeParts(NumOps);
3162   for (int I = 0; I != NumOps; ++I)
3163     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3164 
3165   B.buildBuildVector(DstReg, RemergeParts);
3166 }
3167 
3168 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3169     MachineInstr &MI, MachineIRBuilder &B,
3170     GISelChangeObserver &Observer,
3171     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3172   bool IsTFE = MI.getNumExplicitDefs() == 2;
3173 
3174   // We are only processing the operands of d16 image operations on subtargets
3175   // that use the unpacked register layout, or need to repack the TFE result.
3176 
3177   // TODO: Need to handle a16 images too
3178   // TODO: Do we need to guard against already legalized intrinsics?
3179   if (!IsTFE && !ST.hasUnpackedD16VMem())
3180     return true;
3181 
3182   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3183     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3184 
3185   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3186     return true;
3187 
3188   B.setInstr(MI);
3189 
3190   MachineRegisterInfo *MRI = B.getMRI();
3191   const LLT S32 = LLT::scalar(32);
3192   const LLT S16 = LLT::scalar(16);
3193 
3194   if (BaseOpcode->Store) { // No TFE for stores?
3195     Register VData = MI.getOperand(1).getReg();
3196     LLT Ty = MRI->getType(VData);
3197     if (!Ty.isVector() || Ty.getElementType() != S16)
3198       return true;
3199 
3200     B.setInstr(MI);
3201 
3202     Observer.changingInstr(MI);
3203     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3204     Observer.changedInstr(MI);
3205     return true;
3206   }
3207 
3208   Register DstReg = MI.getOperand(0).getReg();
3209   LLT Ty = MRI->getType(DstReg);
3210   const LLT EltTy = Ty.getScalarType();
3211   const bool IsD16 = Ty.getScalarType() == S16;
3212   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3213 
3214   if (IsTFE) {
3215     // In the IR, TFE is supposed to be used with a 2 element struct return
3216     // type. The intruction really returns these two values in one contiguous
3217     // register, with one additional dword beyond the loaded data. Rewrite the
3218     // return type to use a single register result.
3219     Register Dst1Reg = MI.getOperand(1).getReg();
3220     if (MRI->getType(Dst1Reg) != S32)
3221       return false;
3222 
3223     // TODO: Make sure the TFE operand bit is set.
3224 
3225     // The raw dword aligned data component of the load. The only legal cases
3226     // where this matters should be when using the packed D16 format, for
3227     // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3228     LLT RoundedTy;
3229     LLT TFETy;
3230 
3231     if (IsD16 && ST.hasUnpackedD16VMem()) {
3232       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3233       TFETy = LLT::vector(NumElts + 1, 32);
3234     } else {
3235       unsigned EltSize = Ty.getScalarSizeInBits();
3236       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3237       unsigned RoundedSize = 32 * RoundedElts;
3238       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3239       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3240     }
3241 
3242     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3243     Observer.changingInstr(MI);
3244 
3245     MI.getOperand(0).setReg(TFEReg);
3246     MI.RemoveOperand(1);
3247 
3248     Observer.changedInstr(MI);
3249 
3250     // Insert after the instruction.
3251     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3252 
3253     // Now figure out how to copy the new result register back into the old
3254     // result.
3255 
3256     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3257     int NumDataElts = TFETy.getNumElements() - 1;
3258 
3259     if (!Ty.isVector()) {
3260       // Simplest case is a trivial unmerge (plus a truncate for d16).
3261       UnmergeResults[0] = Ty == S32 ?
3262         DstReg : MRI->createGenericVirtualRegister(S32);
3263 
3264       B.buildUnmerge(UnmergeResults, TFEReg);
3265       if (Ty != S32)
3266         B.buildTrunc(DstReg, UnmergeResults[0]);
3267       return true;
3268     }
3269 
3270     // We have to repack into a new vector of some kind.
3271     for (int I = 0; I != NumDataElts; ++I)
3272       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3273     B.buildUnmerge(UnmergeResults, TFEReg);
3274 
3275     // Drop the final TFE element.
3276     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3277 
3278     if (EltTy == S32)
3279       B.buildBuildVector(DstReg, DataPart);
3280     else if (ST.hasUnpackedD16VMem())
3281       truncToS16Vector(B, DstReg, DataPart);
3282     else
3283       bitcastToS16Vector(B, DstReg, DataPart);
3284 
3285     return true;
3286   }
3287 
3288   // Must be an image load.
3289   if (!Ty.isVector() || Ty.getElementType() != S16)
3290     return true;
3291 
3292   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3293 
3294   LLT WidenedTy = Ty.changeElementType(S32);
3295   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3296 
3297   Observer.changingInstr(MI);
3298   MI.getOperand(0).setReg(WideDstReg);
3299   Observer.changedInstr(MI);
3300 
3301   repackUnpackedD16Load(B, DstReg, WideDstReg);
3302   return true;
3303 }
3304 
3305 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3306   MachineInstr &MI, MachineIRBuilder &B,
3307   GISelChangeObserver &Observer) const {
3308   Register Dst = MI.getOperand(0).getReg();
3309   LLT Ty = B.getMRI()->getType(Dst);
3310   unsigned Size = Ty.getSizeInBits();
3311   MachineFunction &MF = B.getMF();
3312 
3313   Observer.changingInstr(MI);
3314 
3315   // FIXME: We don't really need this intermediate instruction. The intrinsic
3316   // should be fixed to have a memory operand. Since it's readnone, we're not
3317   // allowed to add one.
3318   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3319   MI.RemoveOperand(1); // Remove intrinsic ID
3320 
3321   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3322   // TODO: Should this use datalayout alignment?
3323   const unsigned MemSize = (Size + 7) / 8;
3324   const unsigned MemAlign = 4;
3325   MachineMemOperand *MMO = MF.getMachineMemOperand(
3326     MachinePointerInfo(),
3327     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3328     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3329   MI.addMemOperand(MF, MMO);
3330 
3331   // There are no 96-bit result scalar loads, but widening to 128-bit should
3332   // always be legal. We may need to restore this to a 96-bit result if it turns
3333   // out this needs to be converted to a vector load during RegBankSelect.
3334   if (!isPowerOf2_32(Size)) {
3335     LegalizerHelper Helper(MF, *this, Observer, B);
3336     B.setInstr(MI);
3337 
3338     if (Ty.isVector())
3339       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3340     else
3341       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3342   }
3343 
3344   Observer.changedInstr(MI);
3345   return true;
3346 }
3347 
3348 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3349                                             MachineIRBuilder &B,
3350                                             GISelChangeObserver &Observer) const {
3351   MachineRegisterInfo &MRI = *B.getMRI();
3352 
3353   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
3354   auto IntrID = MI.getIntrinsicID();
3355   switch (IntrID) {
3356   case Intrinsic::amdgcn_if:
3357   case Intrinsic::amdgcn_else: {
3358     MachineInstr *Br = nullptr;
3359     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3360       const SIRegisterInfo *TRI
3361         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3362 
3363       B.setInstr(*BrCond);
3364       Register Def = MI.getOperand(1).getReg();
3365       Register Use = MI.getOperand(3).getReg();
3366 
3367       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3368       if (Br)
3369         BrTarget = Br->getOperand(0).getMBB();
3370 
3371       if (IntrID == Intrinsic::amdgcn_if) {
3372         B.buildInstr(AMDGPU::SI_IF)
3373           .addDef(Def)
3374           .addUse(Use)
3375           .addMBB(BrTarget);
3376       } else {
3377         B.buildInstr(AMDGPU::SI_ELSE)
3378           .addDef(Def)
3379           .addUse(Use)
3380           .addMBB(BrTarget)
3381           .addImm(0);
3382       }
3383 
3384       if (Br)
3385         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3386 
3387       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3388       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3389       MI.eraseFromParent();
3390       BrCond->eraseFromParent();
3391       return true;
3392     }
3393 
3394     return false;
3395   }
3396   case Intrinsic::amdgcn_loop: {
3397     MachineInstr *Br = nullptr;
3398     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3399       const SIRegisterInfo *TRI
3400         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3401 
3402       B.setInstr(*BrCond);
3403 
3404       // FIXME: Need to adjust branch targets based on unconditional branch.
3405       Register Reg = MI.getOperand(2).getReg();
3406       B.buildInstr(AMDGPU::SI_LOOP)
3407         .addUse(Reg)
3408         .addMBB(BrCond->getOperand(1).getMBB());
3409       MI.eraseFromParent();
3410       BrCond->eraseFromParent();
3411       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3412       return true;
3413     }
3414 
3415     return false;
3416   }
3417   case Intrinsic::amdgcn_kernarg_segment_ptr:
3418     return legalizePreloadedArgIntrin(
3419       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3420   case Intrinsic::amdgcn_implicitarg_ptr:
3421     return legalizeImplicitArgPtr(MI, MRI, B);
3422   case Intrinsic::amdgcn_workitem_id_x:
3423     return legalizePreloadedArgIntrin(MI, MRI, B,
3424                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3425   case Intrinsic::amdgcn_workitem_id_y:
3426     return legalizePreloadedArgIntrin(MI, MRI, B,
3427                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3428   case Intrinsic::amdgcn_workitem_id_z:
3429     return legalizePreloadedArgIntrin(MI, MRI, B,
3430                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3431   case Intrinsic::amdgcn_workgroup_id_x:
3432     return legalizePreloadedArgIntrin(MI, MRI, B,
3433                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3434   case Intrinsic::amdgcn_workgroup_id_y:
3435     return legalizePreloadedArgIntrin(MI, MRI, B,
3436                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3437   case Intrinsic::amdgcn_workgroup_id_z:
3438     return legalizePreloadedArgIntrin(MI, MRI, B,
3439                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3440   case Intrinsic::amdgcn_dispatch_ptr:
3441     return legalizePreloadedArgIntrin(MI, MRI, B,
3442                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3443   case Intrinsic::amdgcn_queue_ptr:
3444     return legalizePreloadedArgIntrin(MI, MRI, B,
3445                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3446   case Intrinsic::amdgcn_implicit_buffer_ptr:
3447     return legalizePreloadedArgIntrin(
3448       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3449   case Intrinsic::amdgcn_dispatch_id:
3450     return legalizePreloadedArgIntrin(MI, MRI, B,
3451                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3452   case Intrinsic::amdgcn_fdiv_fast:
3453     return legalizeFDIVFastIntrin(MI, MRI, B);
3454   case Intrinsic::amdgcn_is_shared:
3455     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3456   case Intrinsic::amdgcn_is_private:
3457     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3458   case Intrinsic::amdgcn_wavefrontsize: {
3459     B.setInstr(MI);
3460     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3461     MI.eraseFromParent();
3462     return true;
3463   }
3464   case Intrinsic::amdgcn_s_buffer_load:
3465     return legalizeSBufferLoad(MI, B, Observer);
3466   case Intrinsic::amdgcn_raw_buffer_store:
3467   case Intrinsic::amdgcn_struct_buffer_store:
3468     return legalizeBufferStore(MI, MRI, B, false, false);
3469   case Intrinsic::amdgcn_raw_buffer_store_format:
3470   case Intrinsic::amdgcn_struct_buffer_store_format:
3471     return legalizeBufferStore(MI, MRI, B, false, true);
3472   case Intrinsic::amdgcn_raw_tbuffer_store:
3473   case Intrinsic::amdgcn_struct_tbuffer_store:
3474     return legalizeBufferStore(MI, MRI, B, true, true);
3475   case Intrinsic::amdgcn_raw_buffer_load:
3476   case Intrinsic::amdgcn_struct_buffer_load:
3477     return legalizeBufferLoad(MI, MRI, B, false, false);
3478   case Intrinsic::amdgcn_raw_buffer_load_format:
3479   case Intrinsic::amdgcn_struct_buffer_load_format:
3480     return legalizeBufferLoad(MI, MRI, B, true, false);
3481   case Intrinsic::amdgcn_raw_tbuffer_load:
3482   case Intrinsic::amdgcn_struct_tbuffer_load:
3483     return legalizeBufferLoad(MI, MRI, B, true, true);
3484   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3485   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3486   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3487   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3488   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3489   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3490   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3491   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3492   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3493   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3494   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3495   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3496   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3497   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3498   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3499   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3500   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3501   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3502   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3503   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3504   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3505   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3506   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3507   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3508   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3509   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3510     return legalizeBufferAtomic(MI, B, IntrID);
3511   case Intrinsic::amdgcn_atomic_inc:
3512     return legalizeAtomicIncDec(MI, B, true);
3513   case Intrinsic::amdgcn_atomic_dec:
3514     return legalizeAtomicIncDec(MI, B, false);
3515   default: {
3516     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3517             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3518       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3519     return true;
3520   }
3521   }
3522 
3523   return true;
3524 }
3525