1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
// Round the number of vector elements up to the next power of two.
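// For example, a <3 x s32> becomes a <4 x s32>.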
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
// Round the scalar size in bits up to the next power of two.
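// For example, s24 becomes s32 and s65 becomes s128.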
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
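// Pad a vector type with one extra element, e.g. <3 x s16> becomes <4 x s16>.
// Used together with isSmallOddVector to round odd element counts up to a
// 32-bit multiple.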
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
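// Reduce a vector type to pieces of at most 64 bits each; e.g. a 128-bit
// <4 x s32> is broken into <2 x s32> halves.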
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
// Increase the number of vector elements until the total type size is a
// multiple of 32 bits.
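// For example, a 24-bit <3 x s8> becomes a 32-bit <4 x s8>.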
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
// Any combination of 32 or 64-bit elements up to 1024 bits (and multiples of
// v2s16), as well as vectors with 128 or 256-bit elements.
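// For example s96, <2 x s64> and <4 x s16> are register types, while s24 and
// <3 x s16> are not.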
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
182                                          const GCNTargetMachine &TM)
183   :  ST(ST_) {
184   using namespace TargetOpcode;
185 
186   auto GetAddrSpacePtr = [&TM](unsigned AS) {
187     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
188   };
189 
190   const LLT S1 = LLT::scalar(1);
191   const LLT S16 = LLT::scalar(16);
192   const LLT S32 = LLT::scalar(32);
193   const LLT S64 = LLT::scalar(64);
194   const LLT S128 = LLT::scalar(128);
195   const LLT S256 = LLT::scalar(256);
196   const LLT S1024 = LLT::scalar(1024);
197 
198   const LLT V2S16 = LLT::vector(2, 16);
199   const LLT V4S16 = LLT::vector(4, 16);
200 
201   const LLT V2S32 = LLT::vector(2, 32);
202   const LLT V3S32 = LLT::vector(3, 32);
203   const LLT V4S32 = LLT::vector(4, 32);
204   const LLT V5S32 = LLT::vector(5, 32);
205   const LLT V6S32 = LLT::vector(6, 32);
206   const LLT V7S32 = LLT::vector(7, 32);
207   const LLT V8S32 = LLT::vector(8, 32);
208   const LLT V9S32 = LLT::vector(9, 32);
209   const LLT V10S32 = LLT::vector(10, 32);
210   const LLT V11S32 = LLT::vector(11, 32);
211   const LLT V12S32 = LLT::vector(12, 32);
212   const LLT V13S32 = LLT::vector(13, 32);
213   const LLT V14S32 = LLT::vector(14, 32);
214   const LLT V15S32 = LLT::vector(15, 32);
215   const LLT V16S32 = LLT::vector(16, 32);
216   const LLT V32S32 = LLT::vector(32, 32);
217 
218   const LLT V2S64 = LLT::vector(2, 64);
219   const LLT V3S64 = LLT::vector(3, 64);
220   const LLT V4S64 = LLT::vector(4, 64);
221   const LLT V5S64 = LLT::vector(5, 64);
222   const LLT V6S64 = LLT::vector(6, 64);
223   const LLT V7S64 = LLT::vector(7, 64);
224   const LLT V8S64 = LLT::vector(8, 64);
225   const LLT V16S64 = LLT::vector(16, 64);
226 
227   std::initializer_list<LLT> AllS32Vectors =
228     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
229      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
230   std::initializer_list<LLT> AllS64Vectors =
231     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
232 
233   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
234   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
235   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
236   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
237   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
238   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
239   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
240 
241   const LLT CodePtr = FlatPtr;
242 
243   const std::initializer_list<LLT> AddrSpaces64 = {
244     GlobalPtr, ConstantPtr, FlatPtr
245   };
246 
247   const std::initializer_list<LLT> AddrSpaces32 = {
248     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
249   };
250 
251   const std::initializer_list<LLT> FPTypesBase = {
252     S32, S64
253   };
254 
255   const std::initializer_list<LLT> FPTypes16 = {
256     S32, S64, S16
257   };
258 
259   const std::initializer_list<LLT> FPTypesPK16 = {
260     S32, S64, S16, V2S16
261   };
262 
263   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
264 
265   setAction({G_BRCOND, S1}, Legal); // VCC branches
266   setAction({G_BRCOND, S32}, Legal); // SCC branches
267 
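  // Within each of the action definition builders below the rules are tried
  // in the order they are listed; the first matching rule decides how a given
  // type combination is legalized.
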
268   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
269   // elements for v3s16
270   getActionDefinitionsBuilder(G_PHI)
271     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
272     .legalFor(AllS32Vectors)
273     .legalFor(AllS64Vectors)
274     .legalFor(AddrSpaces64)
275     .legalFor(AddrSpaces32)
276     .clampScalar(0, S32, S256)
277     .widenScalarToNextPow2(0, 32)
278     .clampMaxNumElements(0, S32, 16)
279     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
280     .legalIf(isPointer(0));
281 
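  // Integer add/sub/mul. Narrow types are widened; e.g. an s8 G_ADD becomes
  // s16 on subtargets with 16-bit instructions, and s32 otherwise.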
282   if (ST.has16BitInsts()) {
283     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
284       .legalFor({S32, S16})
285       .clampScalar(0, S16, S32)
286       .scalarize(0)
287       .widenScalarToNextPow2(0, 32);
288   } else {
289     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
290       .legalFor({S32})
291       .clampScalar(0, S32, S32)
292       .scalarize(0);
293   }
294 
295   // FIXME: Not really legal. Placeholder for custom lowering.
296   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
297     .legalFor({S32, S64})
298     .clampScalar(0, S32, S64)
299     .widenScalarToNextPow2(0, 32)
300     .scalarize(0);
301 
302   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
303     .legalFor({S32})
304     .clampScalar(0, S32, S32)
305     .scalarize(0);
306 
307   // Report legal for any types we can handle anywhere. For the cases only legal
308   // on the SALU, RegBankSelect will be able to re-legalize.
309   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
310     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
311     .clampScalar(0, S32, S64)
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
314     .widenScalarToNextPow2(0)
315     .scalarize(0);
316 
317   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
318                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
319     .legalFor({{S32, S1}, {S32, S32}})
320     .clampScalar(0, S32, S32)
321     .scalarize(0); // TODO: Implement.
322 
323   getActionDefinitionsBuilder(G_BITCAST)
324     // Don't worry about the size constraint.
325     .legalIf(all(isRegisterType(0), isRegisterType(1)))
326     .lower();
327 
328 
329   getActionDefinitionsBuilder(G_CONSTANT)
330     .legalFor({S1, S32, S64, S16, GlobalPtr,
331                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
332     .clampScalar(0, S32, S64)
333     .widenScalarToNextPow2(0)
334     .legalIf(isPointer(0));
335 
336   getActionDefinitionsBuilder(G_FCONSTANT)
337     .legalFor({S32, S64, S16})
338     .clampScalar(0, S16, S64);
339 
340   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
341     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
342                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
343     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
344     .clampScalarOrElt(0, S32, S1024)
345     .legalIf(isMultiple32(0))
346     .widenScalarToNextPow2(0, 32)
347     .clampMaxNumElements(0, S32, 16);
348 
349   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
350   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
351     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
352   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
353 
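  // Basic FP arithmetic. The 16-bit and packed v2s16 forms are only legal on
  // subtargets with the corresponding instructions; everything else is clamped
  // and scalarized below.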
354   auto &FPOpActions = getActionDefinitionsBuilder(
355     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
356     .legalFor({S32, S64});
357   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
358     .customFor({S32, S64});
359   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
360     .customFor({S32, S64});
361 
362   if (ST.has16BitInsts()) {
363     if (ST.hasVOP3PInsts())
364       FPOpActions.legalFor({S16, V2S16});
365     else
366       FPOpActions.legalFor({S16});
367 
368     TrigActions.customFor({S16});
369     FDIVActions.customFor({S16});
370   }
371 
372   auto &MinNumMaxNum = getActionDefinitionsBuilder({
373       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
374 
375   if (ST.hasVOP3PInsts()) {
376     MinNumMaxNum.customFor(FPTypesPK16)
377       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378       .clampMaxNumElements(0, S16, 2)
379       .clampScalar(0, S16, S64)
380       .scalarize(0);
381   } else if (ST.has16BitInsts()) {
382     MinNumMaxNum.customFor(FPTypes16)
383       .clampScalar(0, S16, S64)
384       .scalarize(0);
385   } else {
386     MinNumMaxNum.customFor(FPTypesBase)
387       .clampScalar(0, S32, S64)
388       .scalarize(0);
389   }
390 
391   if (ST.hasVOP3PInsts())
392     FPOpActions.clampMaxNumElements(0, S16, 2);
393 
394   FPOpActions
395     .scalarize(0)
396     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
397 
398   TrigActions
399     .scalarize(0)
400     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
401 
402   FDIVActions
403     .scalarize(0)
404     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
405 
406   getActionDefinitionsBuilder({G_FNEG, G_FABS})
407     .legalFor(FPTypesPK16)
408     .clampMaxNumElements(0, S16, 2)
409     .scalarize(0)
410     .clampScalar(0, S16, S64);
411 
412   if (ST.has16BitInsts()) {
413     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
414       .legalFor({S32, S64, S16})
415       .scalarize(0)
416       .clampScalar(0, S16, S64);
417   } else {
418     getActionDefinitionsBuilder(G_FSQRT)
419       .legalFor({S32, S64})
420       .scalarize(0)
421       .clampScalar(0, S32, S64);
422 
423     if (ST.hasFractBug()) {
424       getActionDefinitionsBuilder(G_FFLOOR)
425         .customFor({S64})
426         .legalFor({S32, S64})
427         .scalarize(0)
428         .clampScalar(0, S32, S64);
429     } else {
430       getActionDefinitionsBuilder(G_FFLOOR)
431         .legalFor({S32, S64})
432         .scalarize(0)
433         .clampScalar(0, S32, S64);
434     }
435   }
436 
437   getActionDefinitionsBuilder(G_FPTRUNC)
438     .legalFor({{S32, S64}, {S16, S32}})
439     .scalarize(0)
440     .lower();
441 
442   getActionDefinitionsBuilder(G_FPEXT)
443     .legalFor({{S64, S32}, {S32, S16}})
444     .lowerFor({{S64, S16}}) // FIXME: Implement
445     .scalarize(0);
446 
447   getActionDefinitionsBuilder(G_FSUB)
448       // Use actual fsub instruction
449       .legalFor({S32})
450       // Must use fadd + fneg
451       .lowerFor({S64, S16, V2S16})
452       .scalarize(0)
453       .clampScalar(0, S32, S64);
454 
455   // Whether this is legal depends on the floating point mode for the function.
456   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
457   if (ST.hasMadF16())
458     FMad.customFor({S32, S16});
459   else
460     FMad.customFor({S32});
461   FMad.scalarize(0)
462       .lower();
463 
464   getActionDefinitionsBuilder(G_TRUNC)
465     .alwaysLegal();
466 
467   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
468     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
469                {S32, S1}, {S64, S1}, {S16, S1}})
470     .scalarize(0)
471     .clampScalar(0, S32, S64)
472     .widenScalarToNextPow2(1, 32);
473 
474   // TODO: Split s1->s64 during regbankselect for VALU.
475   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
476     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
477     .lowerFor({{S32, S64}})
478     .lowerIf(typeIs(1, S1))
479     .customFor({{S64, S64}});
480   if (ST.has16BitInsts())
481     IToFP.legalFor({{S16, S16}});
482   IToFP.clampScalar(1, S32, S64)
483        .scalarize(0);
484 
485   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
486     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
487     .customFor({{S64, S64}});
488   if (ST.has16BitInsts())
489     FPToI.legalFor({{S16, S16}});
490   else
491     FPToI.minScalar(1, S32);
492 
493   FPToI.minScalar(0, S32)
494        .scalarize(0)
495        .lower();
496 
497   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
498     .scalarize(0)
499     .lower();
500 
501   if (ST.has16BitInsts()) {
502     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
503       .legalFor({S16, S32, S64})
504       .clampScalar(0, S16, S64)
505       .scalarize(0);
506   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
507     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
508       .legalFor({S32, S64})
509       .clampScalar(0, S32, S64)
510       .scalarize(0);
511   } else {
512     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
513       .legalFor({S32})
514       .customFor({S64})
515       .clampScalar(0, S32, S64)
516       .scalarize(0);
517   }
518 
519   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
520     .scalarize(0)
521     .alwaysLegal();
522 
523   auto &CmpBuilder =
524     getActionDefinitionsBuilder(G_ICMP)
525     // The compare output type differs based on the register bank of the output,
526     // so make both s1 and s32 legal.
527     //
528     // Scalar compares producing output in scc will be promoted to s32, as that
529     // is the allocatable register type that will be needed for the copy from
530     // scc. This will be promoted during RegBankSelect, and we assume something
531     // before that won't try to use s32 result types.
532     //
533     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
534     // bank.
535     .legalForCartesianProduct(
536       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
537     .legalForCartesianProduct(
538       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
539   if (ST.has16BitInsts()) {
540     CmpBuilder.legalFor({{S1, S16}});
541   }
542 
543   CmpBuilder
544     .widenScalarToNextPow2(1)
545     .clampScalar(1, S32, S64)
546     .scalarize(0)
547     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
548 
549   getActionDefinitionsBuilder(G_FCMP)
550     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
551     .widenScalarToNextPow2(1)
552     .clampScalar(1, S32, S64)
553     .scalarize(0);
554 
555   // FIXME: fpow has a selection pattern that should move to custom lowering.
556   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
557   if (ST.has16BitInsts())
558     Exp2Ops.legalFor({S32, S16});
559   else
560     Exp2Ops.legalFor({S32});
561   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
562   Exp2Ops.scalarize(0);
563 
564   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
565   if (ST.has16BitInsts())
566     ExpOps.customFor({{S32}, {S16}});
567   else
568     ExpOps.customFor({S32});
569   ExpOps.clampScalar(0, MinScalarFPTy, S32)
570         .scalarize(0);
571 
572   // The 64-bit versions produce 32-bit results, but only on the SALU.
573   getActionDefinitionsBuilder(G_CTPOP)
574     .legalFor({{S32, S32}, {S32, S64}})
575     .clampScalar(0, S32, S32)
576     .clampScalar(1, S32, S64)
577     .scalarize(0)
578     .widenScalarToNextPow2(0, 32)
579     .widenScalarToNextPow2(1, 32);
580 
581   // The hardware instructions return a different result on 0 than the generic
582   // instructions expect. The hardware produces -1, but these produce the
583   // bitwidth.
584   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
585     .scalarize(0)
586     .clampScalar(0, S32, S32)
587     .clampScalar(1, S32, S64)
588     .widenScalarToNextPow2(0, 32)
589     .widenScalarToNextPow2(1, 32)
590     .lower();
591 
592   // The 64-bit versions produce 32-bit results, but only on the SALU.
593   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
594     .legalFor({{S32, S32}, {S32, S64}})
595     .clampScalar(0, S32, S32)
596     .clampScalar(1, S32, S64)
597     .scalarize(0)
598     .widenScalarToNextPow2(0, 32)
599     .widenScalarToNextPow2(1, 32);
600 
601   getActionDefinitionsBuilder(G_BITREVERSE)
602     .legalFor({S32})
603     .clampScalar(0, S32, S32)
604     .scalarize(0);
605 
606   if (ST.has16BitInsts()) {
607     getActionDefinitionsBuilder(G_BSWAP)
608       .legalFor({S16, S32, V2S16})
609       .clampMaxNumElements(0, S16, 2)
610       // FIXME: Fixing non-power-of-2 before clamp is workaround for
611       // narrowScalar limitation.
612       .widenScalarToNextPow2(0)
613       .clampScalar(0, S16, S32)
614       .scalarize(0);
615 
616     if (ST.hasVOP3PInsts()) {
617       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
618         .legalFor({S32, S16, V2S16})
619         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
620         .clampMaxNumElements(0, S16, 2)
621         .clampScalar(0, S16, S32)
622         .widenScalarToNextPow2(0)
623         .scalarize(0);
624     } else {
625       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
626         .legalFor({S32, S16})
627         .widenScalarToNextPow2(0)
628         .clampScalar(0, S16, S32)
629         .scalarize(0);
630     }
631   } else {
632     // TODO: Should have same legality without v_perm_b32
633     getActionDefinitionsBuilder(G_BSWAP)
634       .legalFor({S32})
635       .lowerIf(narrowerThan(0, 32))
636       // FIXME: Fixing non-power-of-2 before clamp is workaround for
637       // narrowScalar limitation.
638       .widenScalarToNextPow2(0)
639       .maxScalar(0, S32)
640       .scalarize(0)
641       .lower();
642 
643     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
644       .legalFor({S32})
645       .clampScalar(0, S32, S32)
646       .widenScalarToNextPow2(0)
647       .scalarize(0);
648   }
649 
650   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
651     return [=](const LegalityQuery &Query) {
652       return Query.Types[TypeIdx0].getSizeInBits() <
653              Query.Types[TypeIdx1].getSizeInBits();
654     };
655   };
656 
657   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
658     return [=](const LegalityQuery &Query) {
659       return Query.Types[TypeIdx0].getSizeInBits() >
660              Query.Types[TypeIdx1].getSizeInBits();
661     };
662   };
663 
664   getActionDefinitionsBuilder(G_INTTOPTR)
665     // List the common cases
666     .legalForCartesianProduct(AddrSpaces64, {S64})
667     .legalForCartesianProduct(AddrSpaces32, {S32})
668     .scalarize(0)
669     // Accept any address space as long as the size matches
670     .legalIf(sameSize(0, 1))
671     .widenScalarIf(smallerThan(1, 0),
672       [](const LegalityQuery &Query) {
673         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
674       })
675     .narrowScalarIf(greaterThan(1, 0),
676       [](const LegalityQuery &Query) {
677         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
678       });
679 
680   getActionDefinitionsBuilder(G_PTRTOINT)
681     // List the common cases
682     .legalForCartesianProduct(AddrSpaces64, {S64})
683     .legalForCartesianProduct(AddrSpaces32, {S32})
684     .scalarize(0)
685     // Accept any address space as long as the size matches
686     .legalIf(sameSize(0, 1))
687     .widenScalarIf(smallerThan(0, 1),
688       [](const LegalityQuery &Query) {
689         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
690       })
691     .narrowScalarIf(
692       greaterThan(0, 1),
693       [](const LegalityQuery &Query) {
694         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
695       });
696 
697   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
698     .scalarize(0)
699     .custom();
700 
701   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
702   // handle some operations by just promoting the register during
703   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
704   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
705     switch (AS) {
706     // FIXME: Private element size.
707     case AMDGPUAS::PRIVATE_ADDRESS:
708       return 32;
709     // FIXME: Check subtarget
710     case AMDGPUAS::LOCAL_ADDRESS:
711       return ST.useDS128() ? 128 : 64;
712 
713     // Treat constant and global as identical. SMRD loads are sometimes usable
714     // for global loads (ideally constant address space should be eliminated)
715     // depending on the context. Legality cannot be context dependent, but
716     // RegBankSelect can split the load as necessary depending on the pointer
717     // register bank/uniformity and if the memory is invariant or not written in
718     // a kernel.
719     case AMDGPUAS::CONSTANT_ADDRESS:
720     case AMDGPUAS::GLOBAL_ADDRESS:
721       return IsLoad ? 512 : 128;
722     default:
723       return 128;
724     }
725   };
726 
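  // Returns true if the memory access described by the query has to be split
  // into multiple instructions: a vector extload, an access too wide for the
  // address space, an awkward number of 32-bit registers, or insufficient
  // alignment.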
727   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
728                                     bool IsLoad) -> bool {
729     const LLT DstTy = Query.Types[0];
730 
731     // Split vector extloads.
732     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
733     unsigned Align = Query.MMODescrs[0].AlignInBits;
734 
735     if (MemSize < DstTy.getSizeInBits())
736       MemSize = std::max(MemSize, Align);
737 
738     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
739       return true;
740 
741     const LLT PtrTy = Query.Types[1];
742     unsigned AS = PtrTy.getAddressSpace();
743     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
744       return true;
745 
    // Catch weird-sized loads that don't evenly divide into the access sizes
747     // TODO: May be able to widen depending on alignment etc.
748     unsigned NumRegs = (MemSize + 31) / 32;
749     if (NumRegs == 3) {
750       if (!ST.hasDwordx3LoadStores())
751         return true;
752     } else {
753       // If the alignment allows, these should have been widened.
754       if (!isPowerOf2_32(NumRegs))
755         return true;
756     }
757 
758     if (Align < MemSize) {
759       const SITargetLowering *TLI = ST.getTargetLowering();
760       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
761     }
762 
763     return false;
764   };
765 
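  // Returns true if an odd-sized load result should be widened to the next
  // power-of-2 type. Only done while the original size is below the
  // per-address-space limit and the access is aligned to at least the rounded
  // size.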
766   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
767     unsigned Size = Query.Types[0].getSizeInBits();
768     if (isPowerOf2_32(Size))
769       return false;
770 
771     if (Size == 96 && ST.hasDwordx3LoadStores())
772       return false;
773 
774     unsigned AddrSpace = Query.Types[1].getAddressSpace();
775     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
776       return false;
777 
778     unsigned Align = Query.MMODescrs[0].AlignInBits;
779     unsigned RoundedSize = NextPowerOf2(Size);
780     return (Align >= RoundedSize);
781   };
782 
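  // Minimum alignment (in bits) required for the memory descriptors below; a
  // value of 0 places no alignment requirement on subtargets with unaligned
  // buffer access.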
783   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
784   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
785   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
786 
787   // TODO: Refine based on subtargets which support unaligned access or 128-bit
788   // LDS
789   // TODO: Unsupported flat for SI.
790 
791   for (unsigned Op : {G_LOAD, G_STORE}) {
792     const bool IsStore = Op == G_STORE;
793 
794     auto &Actions = getActionDefinitionsBuilder(Op);
795     // Whitelist the common cases.
796     // TODO: Loads to s16 on gfx9
797     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
798                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
799                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
800                                       {S128, GlobalPtr, 128, GlobalAlign32},
801                                       {S64, GlobalPtr, 64, GlobalAlign32},
802                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
803                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
804                                       {S32, GlobalPtr, 8, GlobalAlign8},
805                                       {S32, GlobalPtr, 16, GlobalAlign16},
806 
807                                       {S32, LocalPtr, 32, 32},
808                                       {S64, LocalPtr, 64, 32},
809                                       {V2S32, LocalPtr, 64, 32},
810                                       {S32, LocalPtr, 8, 8},
811                                       {S32, LocalPtr, 16, 16},
812                                       {V2S16, LocalPtr, 32, 32},
813 
814                                       {S32, PrivatePtr, 32, 32},
815                                       {S32, PrivatePtr, 8, 8},
816                                       {S32, PrivatePtr, 16, 16},
817                                       {V2S16, PrivatePtr, 32, 32},
818 
819                                       {S32, FlatPtr, 32, GlobalAlign32},
820                                       {S32, FlatPtr, 16, GlobalAlign16},
821                                       {S32, FlatPtr, 8, GlobalAlign8},
822                                       {V2S16, FlatPtr, 32, GlobalAlign32},
823 
824                                       {S32, ConstantPtr, 32, GlobalAlign32},
825                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
826                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
827                                       {S64, ConstantPtr, 64, GlobalAlign32},
828                                       {S128, ConstantPtr, 128, GlobalAlign32},
829                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
830     Actions
831         .customIf(typeIs(1, Constant32Ptr))
832         // Widen suitably aligned loads by loading extra elements.
833         .moreElementsIf([=](const LegalityQuery &Query) {
834             const LLT Ty = Query.Types[0];
835             return Op == G_LOAD && Ty.isVector() &&
836                    shouldWidenLoadResult(Query);
837           }, moreElementsToNextPow2(0))
838         .widenScalarIf([=](const LegalityQuery &Query) {
839             const LLT Ty = Query.Types[0];
840             return Op == G_LOAD && !Ty.isVector() &&
841                    shouldWidenLoadResult(Query);
842           }, widenScalarOrEltToNextPow2(0))
843         .narrowScalarIf(
844             [=](const LegalityQuery &Query) -> bool {
845               return !Query.Types[0].isVector() &&
846                      needToSplitMemOp(Query, Op == G_LOAD);
847             },
848             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
849               const LLT DstTy = Query.Types[0];
850               const LLT PtrTy = Query.Types[1];
851 
852               const unsigned DstSize = DstTy.getSizeInBits();
853               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
854 
855               // Split extloads.
856               if (DstSize > MemSize)
857                 return std::make_pair(0, LLT::scalar(MemSize));
858 
859               if (!isPowerOf2_32(DstSize)) {
860                 // We're probably decomposing an odd sized store. Try to split
861                 // to the widest type. TODO: Account for alignment. As-is it
862                 // should be OK, since the new parts will be further legalized.
863                 unsigned FloorSize = PowerOf2Floor(DstSize);
864                 return std::make_pair(0, LLT::scalar(FloorSize));
865               }
866 
867               if (DstSize > 32 && (DstSize % 32 != 0)) {
868                 // FIXME: Need a way to specify non-extload of larger size if
869                 // suitably aligned.
870                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
871               }
872 
873               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
874                                                      Op == G_LOAD);
875               if (MemSize > MaxSize)
876                 return std::make_pair(0, LLT::scalar(MaxSize));
877 
878               unsigned Align = Query.MMODescrs[0].AlignInBits;
879               return std::make_pair(0, LLT::scalar(Align));
880             })
881         .fewerElementsIf(
882             [=](const LegalityQuery &Query) -> bool {
883               return Query.Types[0].isVector() &&
884                      needToSplitMemOp(Query, Op == G_LOAD);
885             },
886             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
887               const LLT DstTy = Query.Types[0];
888               const LLT PtrTy = Query.Types[1];
889 
890               LLT EltTy = DstTy.getElementType();
891               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
892                                                      Op == G_LOAD);
893 
894               // FIXME: Handle widened to power of 2 results better. This ends
895               // up scalarizing.
896               // FIXME: 3 element stores scalarized on SI
897 
898               // Split if it's too large for the address space.
899               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
900                 unsigned NumElts = DstTy.getNumElements();
901                 unsigned EltSize = EltTy.getSizeInBits();
902 
903                 if (MaxSize % EltSize == 0) {
904                   return std::make_pair(
905                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
906                 }
907 
908                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
909 
910                 // FIXME: Refine when odd breakdowns handled
911                 // The scalars will need to be re-legalized.
912                 if (NumPieces == 1 || NumPieces >= NumElts ||
913                     NumElts % NumPieces != 0)
914                   return std::make_pair(0, EltTy);
915 
916                 return std::make_pair(0,
917                                       LLT::vector(NumElts / NumPieces, EltTy));
918               }
919 
920               // FIXME: We could probably handle weird extending loads better.
921               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
922               if (DstTy.getSizeInBits() > MemSize)
923                 return std::make_pair(0, EltTy);
924 
925               unsigned EltSize = EltTy.getSizeInBits();
926               unsigned DstSize = DstTy.getSizeInBits();
927               if (!isPowerOf2_32(DstSize)) {
928                 // We're probably decomposing an odd sized store. Try to split
929                 // to the widest type. TODO: Account for alignment. As-is it
930                 // should be OK, since the new parts will be further legalized.
931                 unsigned FloorSize = PowerOf2Floor(DstSize);
932                 return std::make_pair(
933                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
934               }
935 
936               // Need to split because of alignment.
937               unsigned Align = Query.MMODescrs[0].AlignInBits;
938               if (EltSize > Align &&
939                   (EltSize / Align < DstTy.getNumElements())) {
940                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
941               }
942 
943               // May need relegalization for the scalars.
944               return std::make_pair(0, EltTy);
945             })
946         .minScalar(0, S32);
947 
948     if (IsStore)
949       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
950 
951     // TODO: Need a bitcast lower option?
952     Actions
953         .legalIf([=](const LegalityQuery &Query) {
954           const LLT Ty0 = Query.Types[0];
955           unsigned Size = Ty0.getSizeInBits();
956           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
957           unsigned Align = Query.MMODescrs[0].AlignInBits;
958 
959           // FIXME: Widening store from alignment not valid.
960           if (MemSize < Size)
961             MemSize = std::max(MemSize, Align);
962 
963           // No extending vector loads.
964           if (Size > MemSize && Ty0.isVector())
965             return false;
966 
967           switch (MemSize) {
968           case 8:
969           case 16:
970             return Size == 32;
971           case 32:
972           case 64:
973           case 128:
974             return true;
975           case 96:
976             return ST.hasDwordx3LoadStores();
977           case 256:
978           case 512:
979             return true;
980           default:
981             return false;
982           }
983         })
984         .widenScalarToNextPow2(0)
985         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
986   }
987 
988   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
989                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
990                                                   {S32, GlobalPtr, 16, 2 * 8},
991                                                   {S32, LocalPtr, 8, 8},
992                                                   {S32, LocalPtr, 16, 16},
993                                                   {S32, PrivatePtr, 8, 8},
994                                                   {S32, PrivatePtr, 16, 16},
995                                                   {S32, ConstantPtr, 8, 8},
996                                                   {S32, ConstantPtr, 16, 2 * 8}});
997   if (ST.hasFlatAddressSpace()) {
998     ExtLoads.legalForTypesWithMemDesc(
999         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1000   }
1001 
1002   ExtLoads.clampScalar(0, S32, S32)
1003           .widenScalarToNextPow2(0)
1004           .unsupportedIfMemSizeNotPow2()
1005           .lower();
1006 
1007   auto &Atomics = getActionDefinitionsBuilder(
1008     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1009      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1010      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1011      G_ATOMICRMW_UMIN})
1012     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1013                {S64, GlobalPtr}, {S64, LocalPtr}});
1014   if (ST.hasFlatAddressSpace()) {
1015     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1016   }
1017 
1018   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1019     .legalFor({{S32, LocalPtr}});
1020 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1023   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1024     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1025                 {S32, FlatPtr}, {S64, FlatPtr}})
1026     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1027                {S32, RegionPtr}, {S64, RegionPtr}});
1028   // TODO: Pointer types, any 32-bit or 64-bit vector
1029 
1030   // Condition should be s32 for scalar, s1 for vector.
1031   getActionDefinitionsBuilder(G_SELECT)
1032     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1033           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1034           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1035     .clampScalar(0, S16, S64)
1036     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1037     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1038     .scalarize(1)
1039     .clampMaxNumElements(0, S32, 2)
1040     .clampMaxNumElements(0, LocalPtr, 2)
1041     .clampMaxNumElements(0, PrivatePtr, 2)
1042     .scalarize(0)
1043     .widenScalarToNextPow2(0)
1044     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1045 
1046   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1047   // be more flexible with the shift amount type.
1048   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1049     .legalFor({{S32, S32}, {S64, S32}});
1050   if (ST.has16BitInsts()) {
1051     if (ST.hasVOP3PInsts()) {
1052       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1053             .clampMaxNumElements(0, S16, 2);
1054     } else
1055       Shifts.legalFor({{S16, S32}, {S16, S16}});
1056 
1057     // TODO: Support 16-bit shift amounts
1058     Shifts.clampScalar(1, S32, S32);
1059     Shifts.clampScalar(0, S16, S64);
1060     Shifts.widenScalarToNextPow2(0, 16);
1061   } else {
1062     // Make sure we legalize the shift amount type first, as the general
1063     // expansion for the shifted type will produce much worse code if it hasn't
1064     // been truncated already.
1065     Shifts.clampScalar(1, S32, S32);
1066     Shifts.clampScalar(0, S32, S64);
1067     Shifts.widenScalarToNextPow2(0, 32);
1068   }
1069   Shifts.scalarize(0);
1070 
1071   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1072     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1073     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1074     unsigned IdxTypeIdx = 2;
1075 
1076     getActionDefinitionsBuilder(Op)
1077       .customIf([=](const LegalityQuery &Query) {
1078           const LLT EltTy = Query.Types[EltTypeIdx];
1079           const LLT VecTy = Query.Types[VecTypeIdx];
1080           const LLT IdxTy = Query.Types[IdxTypeIdx];
1081           return (EltTy.getSizeInBits() == 16 ||
1082                   EltTy.getSizeInBits() % 32 == 0) &&
1083                  VecTy.getSizeInBits() % 32 == 0 &&
1084                  VecTy.getSizeInBits() <= 1024 &&
1085                  IdxTy.getSizeInBits() == 32;
1086         })
1087       .clampScalar(EltTypeIdx, S32, S64)
1088       .clampScalar(VecTypeIdx, S32, S64)
1089       .clampScalar(IdxTypeIdx, S32, S32);
1090   }
1091 
1092   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1093     .unsupportedIf([=](const LegalityQuery &Query) {
1094         const LLT &EltTy = Query.Types[1].getElementType();
1095         return Query.Types[0] != EltTy;
1096       });
1097 
1098   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1099     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1100     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1101 
1102     // FIXME: Doesn't handle extract of illegal sizes.
1103     getActionDefinitionsBuilder(Op)
1104       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1105       // FIXME: Multiples of 16 should not be legal.
1106       .legalIf([=](const LegalityQuery &Query) {
1107           const LLT BigTy = Query.Types[BigTyIdx];
1108           const LLT LitTy = Query.Types[LitTyIdx];
1109           return (BigTy.getSizeInBits() % 32 == 0) &&
1110                  (LitTy.getSizeInBits() % 16 == 0);
1111         })
1112       .widenScalarIf(
1113         [=](const LegalityQuery &Query) {
1114           const LLT BigTy = Query.Types[BigTyIdx];
1115           return (BigTy.getScalarSizeInBits() < 16);
1116         },
1117         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1118       .widenScalarIf(
1119         [=](const LegalityQuery &Query) {
1120           const LLT LitTy = Query.Types[LitTyIdx];
1121           return (LitTy.getScalarSizeInBits() < 16);
1122         },
1123         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1124       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1125       .widenScalarToNextPow2(BigTyIdx, 32);
1126 
1127   }
1128 
1129   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1130     .legalForCartesianProduct(AllS32Vectors, {S32})
1131     .legalForCartesianProduct(AllS64Vectors, {S64})
1132     .clampNumElements(0, V16S32, V32S32)
1133     .clampNumElements(0, V2S64, V16S64)
1134     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1135 
1136   if (ST.hasScalarPackInsts()) {
1137     BuildVector
1138       // FIXME: Should probably widen s1 vectors straight to s32
1139       .minScalarOrElt(0, S16)
1140       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1141       .minScalar(1, S32);
1142 
1143     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1144       .legalFor({V2S16, S32})
1145       .lower();
1146     BuildVector.minScalarOrElt(0, S32);
1147   } else {
1148     BuildVector.customFor({V2S16, S16});
1149     BuildVector.minScalarOrElt(0, S32);
1150 
1151     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1152       .customFor({V2S16, S32})
1153       .lower();
1154   }
1155 
1156   BuildVector.legalIf(isRegisterType(0));
1157 
1158   // FIXME: Clamp maximum size
1159   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1160     .legalIf(isRegisterType(0));
1161 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1163   // pre-legalize.
1164   if (ST.hasVOP3PInsts()) {
1165     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1166       .customFor({V2S16, V2S16})
1167       .lower();
1168   } else
1169     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1170 
1171   // Merge/Unmerge
1172   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1173     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1174     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1175 
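    // Element types narrower than 8 bits, wider than 64 bits, or with a
    // non-power-of-2 size cannot be merged/unmerged directly and are
    // scalarized below.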
1176     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1177       const LLT &Ty = Query.Types[TypeIdx];
1178       if (Ty.isVector()) {
1179         const LLT &EltTy = Ty.getElementType();
1180         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1181           return true;
1182         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1183           return true;
1184       }
1185       return false;
1186     };
1187 
1188     auto &Builder = getActionDefinitionsBuilder(Op)
1189       // Try to widen to s16 first for small types.
1190       // TODO: Only do this on targets with legal s16 shifts
1191       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1192 
1193       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1194       .lowerFor({{S16, V2S16}})
1195       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1196       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1197                            elementTypeIs(1, S16)),
1198                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s256 and make it a power of 2. It's not
1200       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1201       // valid.
1202       .clampScalar(LitTyIdx, S32, S256)
1203       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1204       // Break up vectors with weird elements into scalars
1205       .fewerElementsIf(
1206         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1207         scalarize(0))
1208       .fewerElementsIf(
1209         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1210         scalarize(1))
1211       .clampScalar(BigTyIdx, S32, S1024);
1212 
1213     if (Op == G_MERGE_VALUES) {
1214       Builder.widenScalarIf(
1215         // TODO: Use 16-bit shifts if legal for 8-bit values?
1216         [=](const LegalityQuery &Query) {
1217           const LLT Ty = Query.Types[LitTyIdx];
1218           return Ty.getSizeInBits() < 32;
1219         },
1220         changeTo(LitTyIdx, S32));
1221     }
1222 
1223     Builder.widenScalarIf(
1224       [=](const LegalityQuery &Query) {
1225         const LLT Ty = Query.Types[BigTyIdx];
1226         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1227           Ty.getSizeInBits() % 16 != 0;
1228       },
1229       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128,
        // whichever is smaller.
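        // For example, s65 widens to s128, while s257 widens to s320 rather
        // than s512.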
1232         const LLT &Ty = Query.Types[BigTyIdx];
1233         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1234         if (NewSizeInBits >= 256) {
1235           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1236           if (RoundedTo < NewSizeInBits)
1237             NewSizeInBits = RoundedTo;
1238         }
1239         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1240       })
1241       .legalIf([=](const LegalityQuery &Query) {
1242           const LLT &BigTy = Query.Types[BigTyIdx];
1243           const LLT &LitTy = Query.Types[LitTyIdx];
1244 
1245           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1246             return false;
1247           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1248             return false;
1249 
1250           return BigTy.getSizeInBits() % 16 == 0 &&
1251                  LitTy.getSizeInBits() % 16 == 0 &&
1252                  BigTy.getSizeInBits() <= 1024;
1253         })
1254       // Any vectors left are the wrong size. Scalarize them.
1255       .scalarize(0)
1256       .scalarize(1);
1257   }
1258 
1259   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1260   // RegBankSelect.
1261   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1262     .legalFor({{S32}, {S64}});
1263 
1264   if (ST.hasVOP3PInsts()) {
1265     SextInReg.lowerFor({{V2S16}})
1266       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1267       // get more vector shift opportunities, since we'll get those when
1268       // expanded.
1269       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1270   } else if (ST.has16BitInsts()) {
1271     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1272   } else {
1273     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1275     SextInReg.lowerFor({{S32}, {S64}});
1276   }
1277 
1278   SextInReg
1279     .scalarize(0)
1280     .clampScalar(0, S32, S64)
1281     .lower();
1282 
1283   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1284     .legalFor({S64});
1285 
1286   getActionDefinitionsBuilder({
1287       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1288       G_FCOPYSIGN,
1289 
1290       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1291       G_READ_REGISTER,
1292       G_WRITE_REGISTER,
1293 
1294       G_SADDO, G_SSUBO,
1295 
1296        // TODO: Implement
1297       G_FMINIMUM, G_FMAXIMUM
1298     }).lower();
1299 
1300   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1301         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1302         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1303     .unsupported();
1304 
1305   computeTables();
1306   verify(*ST.getInstrInfo());
1307 }
1308 
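// Dispatcher for every operation marked as custom in the rules above. Returns
// true if the instruction was successfully legalized.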
1309 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1310                                          MachineRegisterInfo &MRI,
1311                                          MachineIRBuilder &B,
1312                                          GISelChangeObserver &Observer) const {
1313   switch (MI.getOpcode()) {
1314   case TargetOpcode::G_ADDRSPACE_CAST:
1315     return legalizeAddrSpaceCast(MI, MRI, B);
1316   case TargetOpcode::G_FRINT:
1317     return legalizeFrint(MI, MRI, B);
1318   case TargetOpcode::G_FCEIL:
1319     return legalizeFceil(MI, MRI, B);
1320   case TargetOpcode::G_INTRINSIC_TRUNC:
1321     return legalizeIntrinsicTrunc(MI, MRI, B);
1322   case TargetOpcode::G_SITOFP:
1323     return legalizeITOFP(MI, MRI, B, true);
1324   case TargetOpcode::G_UITOFP:
1325     return legalizeITOFP(MI, MRI, B, false);
1326   case TargetOpcode::G_FPTOSI:
1327     return legalizeFPTOI(MI, MRI, B, true);
1328   case TargetOpcode::G_FPTOUI:
1329     return legalizeFPTOI(MI, MRI, B, false);
1330   case TargetOpcode::G_FMINNUM:
1331   case TargetOpcode::G_FMAXNUM:
1332   case TargetOpcode::G_FMINNUM_IEEE:
1333   case TargetOpcode::G_FMAXNUM_IEEE:
1334     return legalizeMinNumMaxNum(MI, MRI, B);
1335   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1336     return legalizeExtractVectorElt(MI, MRI, B);
1337   case TargetOpcode::G_INSERT_VECTOR_ELT:
1338     return legalizeInsertVectorElt(MI, MRI, B);
1339   case TargetOpcode::G_SHUFFLE_VECTOR:
1340     return legalizeShuffleVector(MI, MRI, B);
1341   case TargetOpcode::G_FSIN:
1342   case TargetOpcode::G_FCOS:
1343     return legalizeSinCos(MI, MRI, B);
1344   case TargetOpcode::G_GLOBAL_VALUE:
1345     return legalizeGlobalValue(MI, MRI, B);
1346   case TargetOpcode::G_LOAD:
1347     return legalizeLoad(MI, MRI, B, Observer);
1348   case TargetOpcode::G_FMAD:
1349     return legalizeFMad(MI, MRI, B);
1350   case TargetOpcode::G_FDIV:
1351     return legalizeFDIV(MI, MRI, B);
1352   case TargetOpcode::G_ATOMIC_CMPXCHG:
1353     return legalizeAtomicCmpXChg(MI, MRI, B);
1354   case TargetOpcode::G_FLOG:
1355     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1356   case TargetOpcode::G_FLOG10:
1357     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1358   case TargetOpcode::G_FEXP:
1359     return legalizeFExp(MI, B);
1360   case TargetOpcode::G_FFLOOR:
1361     return legalizeFFloor(MI, MRI, B);
1362   case TargetOpcode::G_BUILD_VECTOR:
1363     return legalizeBuildVector(MI, MRI, B);
1364   default:
1365     return false;
1366   }
1367 
1368   llvm_unreachable("expected switch to return");
1369 }
1370 
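// Returns a register holding the 32-bit aperture for the given LDS or private
// address space, i.e. the high half of the flat addresses that alias it. The
// value is read from the hardware aperture registers when available, and
// otherwise loaded from the queue pointer.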
1371 Register AMDGPULegalizerInfo::getSegmentAperture(
1372   unsigned AS,
1373   MachineRegisterInfo &MRI,
1374   MachineIRBuilder &B) const {
1375   MachineFunction &MF = B.getMF();
1376   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1377   const LLT S32 = LLT::scalar(32);
1378 
1379   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1380 
1381   if (ST.hasApertureRegs()) {
1382     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1383     // getreg.
1384     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1385         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1386         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1387     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1388         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1389         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1390     unsigned Encoding =
1391         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1392         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1393         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1394 
1395     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1396 
1397     B.buildInstr(AMDGPU::S_GETREG_B32)
1398       .addDef(GetReg)
1399       .addImm(Encoding);
1400     MRI.setType(GetReg, S32);
1401 
1402     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1403     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1404   }
1405 
1406   Register QueuePtr = MRI.createGenericVirtualRegister(
1407     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1408 
1409   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1410   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1411     return Register();
1412 
1413   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1414   // private_segment_aperture_base_hi.
1415   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1416 
1417   // TODO: can we be smarter about machine pointer info?
1418   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1419   MachineMemOperand *MMO = MF.getMachineMemOperand(
1420     PtrInfo,
1421     MachineMemOperand::MOLoad |
1422     MachineMemOperand::MODereferenceable |
1423     MachineMemOperand::MOInvariant,
1424     4,
1425     MinAlign(64, StructOffset));
1426 
1427   Register LoadAddr;
1428 
1429   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1430   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1431 }
1432 
1433 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1434   MachineInstr &MI, MachineRegisterInfo &MRI,
1435   MachineIRBuilder &B) const {
1436   MachineFunction &MF = B.getMF();
1437 
1438   B.setInstr(MI);
1439 
1440   const LLT S32 = LLT::scalar(32);
1441   Register Dst = MI.getOperand(0).getReg();
1442   Register Src = MI.getOperand(1).getReg();
1443 
1444   LLT DstTy = MRI.getType(Dst);
1445   LLT SrcTy = MRI.getType(Src);
1446   unsigned DestAS = DstTy.getAddressSpace();
1447   unsigned SrcAS = SrcTy.getAddressSpace();
1448 
1449   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1450   // vector element.
1451   assert(!DstTy.isVector());
1452 
1453   const AMDGPUTargetMachine &TM
1454     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1455 
1456   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1457   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1458     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1459     return true;
1460   }
1461 
1462   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1463     // Truncate.
1464     B.buildExtract(Dst, Src, 0);
1465     MI.eraseFromParent();
1466     return true;
1467   }
1468 
1469   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1470     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1471     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1472 
1473     // FIXME: This is a bit ugly: we build a 64-bit pointer by merging two 32-bit
1474     // pointers. Merge operands are required to be the same type, but inserting an
1475     // extra ptrtoint here would be kind of pointless.
1476     auto HighAddr = B.buildConstant(
1477       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1478     B.buildMerge(Dst, {Src, HighAddr});
1479     MI.eraseFromParent();
1480     return true;
1481   }
1482 
1483   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1484     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1485            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1486     unsigned NullVal = TM.getNullPointerValue(DestAS);
1487 
1488     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1489     auto FlatNull = B.buildConstant(SrcTy, 0);
1490 
1491     // Extract low 32-bits of the pointer.
1492     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1493 
1494     auto CmpRes =
1495         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1496     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1497 
1498     MI.eraseFromParent();
1499     return true;
1500   }
1501 
1502   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1503     return false;
1504 
1505   if (!ST.hasFlatAddressSpace())
1506     return false;
1507 
1508   auto SegmentNull =
1509       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1510   auto FlatNull =
1511       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1512 
1513   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1514   if (!ApertureReg.isValid())
1515     return false;
1516 
1517   auto CmpRes =
1518       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1519 
1520   // Coerce the type of the low half of the result so we can use merge_values.
1521   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1522 
1523   // TODO: Should we allow mismatched types but matching sizes in merges to
1524   // avoid the ptrtoint?
1525   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1526   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1527 
1528   MI.eraseFromParent();
1529   return true;
1530 }
1531 
1532 bool AMDGPULegalizerInfo::legalizeFrint(
1533   MachineInstr &MI, MachineRegisterInfo &MRI,
1534   MachineIRBuilder &B) const {
1535   B.setInstr(MI);
1536 
1537   Register Src = MI.getOperand(1).getReg();
1538   LLT Ty = MRI.getType(Src);
1539   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1540 
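  // Rounding trick: adding and then subtracting copysign(2^52, src) forces the
  // FPU to round to the nearest integer, since doubles with magnitude >= 2^52
  // have no fractional bits. Inputs with |src| > 0x1.fffffffffffffp+51 are
  // already integral and are passed through unchanged by the select below.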
1541   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1542   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1543 
1544   auto C1 = B.buildFConstant(Ty, C1Val);
1545   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1546 
1547   // TODO: Should this propagate fast-math-flags?
1548   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1549   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1550 
1551   auto C2 = B.buildFConstant(Ty, C2Val);
1552   auto Fabs = B.buildFAbs(Ty, Src);
1553 
1554   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1555   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1556   return true;
1557 }
1558 
1559 bool AMDGPULegalizerInfo::legalizeFceil(
1560   MachineInstr &MI, MachineRegisterInfo &MRI,
1561   MachineIRBuilder &B) const {
1562   B.setInstr(MI);
1563 
1564   const LLT S1 = LLT::scalar(1);
1565   const LLT S64 = LLT::scalar(64);
1566 
1567   Register Src = MI.getOperand(1).getReg();
1568   assert(MRI.getType(Src) == S64);
1569 
1570   // result = trunc(src)
1571   // if (src > 0.0 && src != result)
1572   //   result += 1.0
1573 
1574   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1575 
1576   const auto Zero = B.buildFConstant(S64, 0.0);
1577   const auto One = B.buildFConstant(S64, 1.0);
1578   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1579   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1580   auto And = B.buildAnd(S1, Gt0, NeTrunc);
1581   auto Add = B.buildSelect(S64, And, One, Zero);
1582 
1583   // TODO: Should this propagate fast-math-flags?
1584   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1585   return true;
1586 }
1587 
1588 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1589                                               MachineIRBuilder &B) {
1590   const unsigned FractBits = 52;
1591   const unsigned ExpBits = 11;
1592   LLT S32 = LLT::scalar(32);
1593 
1594   auto Const0 = B.buildConstant(S32, FractBits - 32);
1595   auto Const1 = B.buildConstant(S32, ExpBits);
1596 
1597   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1598     .addUse(Hi)
1599     .addUse(Const0.getReg(0)).addUse(Const1.getReg(0));
1600 
1601   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1602 }
1603 
1604 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1605   MachineInstr &MI, MachineRegisterInfo &MRI,
1606   MachineIRBuilder &B) const {
1607   B.setInstr(MI);
1608 
1609   const LLT S1 = LLT::scalar(1);
1610   const LLT S32 = LLT::scalar(32);
1611   const LLT S64 = LLT::scalar(64);
1612 
1613   Register Src = MI.getOperand(1).getReg();
1614   assert(MRI.getType(Src) == S64);
1615 
1616   // TODO: Should this use extract since the low half is unused?
1617   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1618   Register Hi = Unmerge.getReg(1);
1619 
1620   // Extract the upper half, since this is where we will find the sign and
1621   // exponent.
1622   auto Exp = extractF64Exponent(Hi, B);
1623 
1624   const unsigned FractBits = 52;
1625 
1626   // Extract the sign bit.
1627   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1628   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1629 
1630   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1631 
1632   const auto Zero32 = B.buildConstant(S32, 0);
1633 
1634   // Extend back to 64-bits.
1635   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1636 
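  // FractMask >> Exp (arithmetic shift) leaves bits set only in the fractional
  // positions for this exponent, so AND-ing the source with its complement
  // clears the fraction, i.e. truncates toward zero. The selects below handle
  // Exp < 0 (|src| < 1, keep only the sign bit) and Exp > 51 (already integral).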
1637   auto Shr = B.buildAShr(S64, FractMask, Exp);
1638   auto Not = B.buildNot(S64, Shr);
1639   auto Tmp0 = B.buildAnd(S64, Src, Not);
1640   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1641 
1642   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1643   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1644 
1645   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1646   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1647   return true;
1648 }
1649 
1650 bool AMDGPULegalizerInfo::legalizeITOFP(
1651   MachineInstr &MI, MachineRegisterInfo &MRI,
1652   MachineIRBuilder &B, bool Signed) const {
1653   B.setInstr(MI);
1654 
1655   Register Dst = MI.getOperand(0).getReg();
1656   Register Src = MI.getOperand(1).getReg();
1657 
1658   const LLT S64 = LLT::scalar(64);
1659   const LLT S32 = LLT::scalar(32);
1660 
1661   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1662 
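  // Convert each 32-bit half separately: the high half (signed or unsigned as
  // requested) is scaled by 2^32 with ldexp and then added to the unsigned
  // conversion of the low half.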
1663   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1664 
1665   auto CvtHi = Signed ?
1666     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1667     B.buildUITOFP(S64, Unmerge.getReg(1));
1668 
1669   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1670 
1671   auto ThirtyTwo = B.buildConstant(S32, 32);
1672   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1673     .addUse(CvtHi.getReg(0))
1674     .addUse(ThirtyTwo.getReg(0));
1675 
1676   // TODO: Should this propagate fast-math-flags?
1677   B.buildFAdd(Dst, LdExp, CvtLo);
1678   MI.eraseFromParent();
1679   return true;
1680 }
1681 
1682 // TODO: Copied from DAG implementation. Verify logic and document how this
1683 // actually works.
1684 bool AMDGPULegalizerInfo::legalizeFPTOI(
1685   MachineInstr &MI, MachineRegisterInfo &MRI,
1686   MachineIRBuilder &B, bool Signed) const {
1687   B.setInstr(MI);
1688 
1689   Register Dst = MI.getOperand(0).getReg();
1690   Register Src = MI.getOperand(1).getReg();
1691 
1692   const LLT S64 = LLT::scalar(64);
1693   const LLT S32 = LLT::scalar(32);
1694 
1695   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1696 
1697   unsigned Flags = MI.getFlags();
1698 
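  // K0 is 2^-32 and K1 is -2^32: the high 32 bits come from
  // floor(trunc(src) * 2^-32), and fma(hi, -2^32, trunc(src)) leaves the low 32
  // bits as a non-negative value that fits in an unsigned conversion.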
1699   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1700   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1701   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1702 
1703   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1704   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1705   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1706 
1707   auto Hi = Signed ?
1708     B.buildFPTOSI(S32, FloorMul) :
1709     B.buildFPTOUI(S32, FloorMul);
1710   auto Lo = B.buildFPTOUI(S32, Fma);
1711 
1712   B.buildMerge(Dst, { Lo, Hi });
1713   MI.eraseFromParent();
1714 
1715   return true;
1716 }
1717 
1718 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1719   MachineInstr &MI, MachineRegisterInfo &MRI,
1720   MachineIRBuilder &B) const {
1721   MachineFunction &MF = B.getMF();
1722   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1723 
1724   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1725                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1726 
1727   // With ieee_mode disabled, the instructions have the correct behavior
1728   // already for G_FMINNUM/G_FMAXNUM
1729   if (!MFI->getMode().IEEE)
1730     return !IsIEEEOp;
1731 
1732   if (IsIEEEOp)
1733     return true;
1734 
1735   MachineIRBuilder HelperBuilder(MI);
1736   GISelObserverWrapper DummyObserver;
1737   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1738   HelperBuilder.setInstr(MI);
1739   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1740 }
1741 
1742 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1743   MachineInstr &MI, MachineRegisterInfo &MRI,
1744   MachineIRBuilder &B) const {
1745   // TODO: Should move some of this into LegalizerHelper.
1746 
1747   // TODO: Promote dynamic indexing of s16 to s32
1748 
1749   // FIXME: Artifact combiner probably should have replaced the truncated
1750   // constant before this, so we shouldn't need
1751   // getConstantVRegValWithLookThrough.
1752   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1753     MI.getOperand(2).getReg(), MRI);
1754   if (!IdxVal) // Dynamic case will be selected to register indexing.
1755     return true;
1756 
1757   Register Dst = MI.getOperand(0).getReg();
1758   Register Vec = MI.getOperand(1).getReg();
1759 
1760   LLT VecTy = MRI.getType(Vec);
1761   LLT EltTy = VecTy.getElementType();
1762   assert(EltTy == MRI.getType(Dst));
1763 
1764   B.setInstr(MI);
1765 
1766   if (IdxVal->Value < VecTy.getNumElements())
1767     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1768   else
1769     B.buildUndef(Dst);
1770 
1771   MI.eraseFromParent();
1772   return true;
1773 }
1774 
1775 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1776   MachineInstr &MI, MachineRegisterInfo &MRI,
1777   MachineIRBuilder &B) const {
1778   // TODO: Should move some of this into LegalizerHelper.
1779 
1780   // TODO: Promote dynamic indexing of s16 to s32
1781 
1782   // FIXME: Artifact combiner probably should have replaced the truncated
1783   // constant before this, so we shouldn't need
1784   // getConstantVRegValWithLookThrough.
1785   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1786     MI.getOperand(3).getReg(), MRI);
1787   if (!IdxVal) // Dynamic case will be selected to register indexing.
1788     return true;
1789 
1790   Register Dst = MI.getOperand(0).getReg();
1791   Register Vec = MI.getOperand(1).getReg();
1792   Register Ins = MI.getOperand(2).getReg();
1793 
1794   LLT VecTy = MRI.getType(Vec);
1795   LLT EltTy = VecTy.getElementType();
1796   assert(EltTy == MRI.getType(Ins));
1797 
1798   B.setInstr(MI);
1799 
1800   if (IdxVal->Value < VecTy.getNumElements())
1801     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1802   else
1803     B.buildUndef(Dst);
1804 
1805   MI.eraseFromParent();
1806   return true;
1807 }
1808 
1809 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1810   assert(Mask.size() == 2);
1811 
1812   // If one half is undef, the other is trivially in the same reg.
1813   if (Mask[0] == -1 || Mask[1] == -1)
1814     return true;
1815   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1816          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1817 }
1818 
1819 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1820   MachineInstr &MI, MachineRegisterInfo &MRI,
1821   MachineIRBuilder &B) const {
1822   const LLT V2S16 = LLT::vector(2, 16);
1823 
1824   Register Dst = MI.getOperand(0).getReg();
1825   Register Src0 = MI.getOperand(1).getReg();
1826   LLT DstTy = MRI.getType(Dst);
1827   LLT SrcTy = MRI.getType(Src0);
1828 
1829   if (SrcTy == V2S16 && DstTy == V2S16 &&
1830       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1831     return true;
1832 
1833   MachineIRBuilder HelperBuilder(MI);
1834   GISelObserverWrapper DummyObserver;
1835   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1836   HelperBuilder.setInstr(MI);
1837   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1838 }
1839 
1840 bool AMDGPULegalizerInfo::legalizeSinCos(
1841   MachineInstr &MI, MachineRegisterInfo &MRI,
1842   MachineIRBuilder &B) const {
1843   B.setInstr(MI);
1844 
1845   Register DstReg = MI.getOperand(0).getReg();
1846   Register SrcReg = MI.getOperand(1).getReg();
1847   LLT Ty = MRI.getType(DstReg);
1848   unsigned Flags = MI.getFlags();
1849 
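  // Note: the hardware sin/cos intrinsics take their argument scaled by
  // 1/(2*pi), i.e. in revolutions rather than radians. Subtargets with a
  // reduced valid input range additionally wrap the scaled value into [0, 1)
  // with amdgcn.fract first.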
1850   Register TrigVal;
1851   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1852   if (ST.hasTrigReducedRange()) {
1853     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1854     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1855       .addUse(MulVal.getReg(0))
1856       .setMIFlags(Flags).getReg(0);
1857   } else
1858     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1859 
1860   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1861     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1862   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1863     .addUse(TrigVal)
1864     .setMIFlags(Flags);
1865   MI.eraseFromParent();
1866   return true;
1867 }
1868 
1869 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1870   Register DstReg, LLT PtrTy,
1871   MachineIRBuilder &B, const GlobalValue *GV,
1872   unsigned Offset, unsigned GAFlags) const {
1873   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1874   // to the following code sequence:
1875   //
1876   // For constant address space:
1877   //   s_getpc_b64 s[0:1]
1878   //   s_add_u32 s0, s0, $symbol
1879   //   s_addc_u32 s1, s1, 0
1880   //
1881   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1882   //   a fixup or relocation is emitted to replace $symbol with a literal
1883   //   constant, which is a pc-relative offset from the encoding of the $symbol
1884   //   operand to the global variable.
1885   //
1886   // For global address space:
1887   //   s_getpc_b64 s[0:1]
1888   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1889   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1890   //
1891   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1892   //   fixups or relocations are emitted to replace $symbol@*@lo and
1893   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1894   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1895   //   operand to the global variable.
1896   //
1897   // What we want here is an offset from the value returned by s_getpc
1898   // (which is the address of the s_add_u32 instruction) to the global
1899   // variable, but since the encoding of $symbol starts 4 bytes after the start
1900   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1901   // small. This requires us to add 4 to the global variable offset in order to
1902   // compute the correct address.
1903 
1904   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1905 
1906   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1907     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1908 
1909   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1910     .addDef(PCReg);
1911 
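  // The GAFlags + 1 in the else branch below assumes the @lo/@hi relocation
  // flag variants are adjacent enum values, so it selects the matching @hi
  // flag for the second operand.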
1912   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1913   if (GAFlags == SIInstrInfo::MO_NONE)
1914     MIB.addImm(0);
1915   else
1916     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1917 
1918   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1919 
1920   if (PtrTy.getSizeInBits() == 32)
1921     B.buildExtract(DstReg, PCReg, 0);
1922   return true;
1923 }
1924 
1925 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   MachineIRBuilder &B) const {
1928   Register DstReg = MI.getOperand(0).getReg();
1929   LLT Ty = MRI.getType(DstReg);
1930   unsigned AS = Ty.getAddressSpace();
1931 
1932   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1933   MachineFunction &MF = B.getMF();
1934   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1935   B.setInstr(MI);
1936 
1937   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1938     if (!MFI->isEntryFunction()) {
1939       const Function &Fn = MF.getFunction();
1940       DiagnosticInfoUnsupported BadLDSDecl(
1941         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1942       Fn.getContext().diagnose(BadLDSDecl);
1943     }
1944 
1945     // TODO: We could emit code to handle the initialization somewhere.
1946     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1947       const SITargetLowering *TLI = ST.getTargetLowering();
1948       if (!TLI->shouldUseLDSConstAddress(GV)) {
1949         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1950         return true; // Leave in place;
1951       }
1952 
1953       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1954       MI.eraseFromParent();
1955       return true;
1956     }
1957 
1958     const Function &Fn = MF.getFunction();
1959     DiagnosticInfoUnsupported BadInit(
1960       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1961     Fn.getContext().diagnose(BadInit);
1962     return true;
1963   }
1964 
1965   const SITargetLowering *TLI = ST.getTargetLowering();
1966 
1967   if (TLI->shouldEmitFixup(GV)) {
1968     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1969     MI.eraseFromParent();
1970     return true;
1971   }
1972 
1973   if (TLI->shouldEmitPCReloc(GV)) {
1974     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1975     MI.eraseFromParent();
1976     return true;
1977   }
1978 
1979   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1980   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1981 
1982   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1983     MachinePointerInfo::getGOT(MF),
1984     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1985     MachineMemOperand::MOInvariant,
1986     8 /*Size*/, 8 /*Align*/);
1987 
1988   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1989 
1990   if (Ty.getSizeInBits() == 32) {
1991     // Truncate if this is a 32-bit constant address.
1992     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1993     B.buildExtract(DstReg, Load, 0);
1994   } else
1995     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1996 
1997   MI.eraseFromParent();
1998   return true;
1999 }
2000 
2001 bool AMDGPULegalizerInfo::legalizeLoad(
2002   MachineInstr &MI, MachineRegisterInfo &MRI,
2003   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2004   B.setInstr(MI);
2005   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2006   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2007   Observer.changingInstr(MI);
2008   MI.getOperand(1).setReg(Cast.getReg(0));
2009   Observer.changedInstr(MI);
2010   return true;
2011 }
2012 
2013 bool AMDGPULegalizerInfo::legalizeFMad(
2014   MachineInstr &MI, MachineRegisterInfo &MRI,
2015   MachineIRBuilder &B) const {
2016   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2017   assert(Ty.isScalar());
2018 
2019   MachineFunction &MF = B.getMF();
2020   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2021 
2022   // TODO: Always legal with future ftz flag.
2023   // FIXME: Do we only need to check the denormal mode of the output type?
2024   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2025     return true;
2026   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2027     return true;
2028 
2029   MachineIRBuilder HelperBuilder(MI);
2030   GISelObserverWrapper DummyObserver;
2031   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2032   HelperBuilder.setMBB(*MI.getParent());
2033   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2034 }
2035 
2036 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2037   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2038   Register DstReg = MI.getOperand(0).getReg();
2039   Register PtrReg = MI.getOperand(1).getReg();
2040   Register CmpVal = MI.getOperand(2).getReg();
2041   Register NewVal = MI.getOperand(3).getReg();
2042 
2043   assert(SITargetLowering::isFlatGlobalAddrSpace(
2044            MRI.getType(PtrReg).getAddressSpace()) &&
2045          "this should not have been custom lowered");
2046 
2047   LLT ValTy = MRI.getType(CmpVal);
2048   LLT VecTy = LLT::vector(2, ValTy);
2049 
2050   B.setInstr(MI);
2051   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2052 
2053   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2054     .addDef(DstReg)
2055     .addUse(PtrReg)
2056     .addUse(PackedVal)
2057     .setMemRefs(MI.memoperands());
2058 
2059   MI.eraseFromParent();
2060   return true;
2061 }
2062 
2063 bool AMDGPULegalizerInfo::legalizeFlog(
2064   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2065   Register Dst = MI.getOperand(0).getReg();
2066   Register Src = MI.getOperand(1).getReg();
2067   LLT Ty = B.getMRI()->getType(Dst);
2068   unsigned Flags = MI.getFlags();
2069   B.setInstr(MI);
2070 
2071   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2072   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2073 
2074   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2075   MI.eraseFromParent();
2076   return true;
2077 }
2078 
2079 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2080                                        MachineIRBuilder &B) const {
2081   Register Dst = MI.getOperand(0).getReg();
2082   Register Src = MI.getOperand(1).getReg();
2083   unsigned Flags = MI.getFlags();
2084   LLT Ty = B.getMRI()->getType(Dst);
2085   B.setInstr(MI);
2086 
2087   auto K = B.buildFConstant(Ty, numbers::log2e);
2088   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2089   B.buildFExp2(Dst, Mul, Flags);
2090   MI.eraseFromParent();
2091   return true;
2092 }
2093 
2094 // Find a source register, ignoring any possible source modifiers.
2095 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2096   Register ModSrc = OrigSrc;
2097   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2098     ModSrc = SrcFNeg->getOperand(1).getReg();
2099     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2100       ModSrc = SrcFAbs->getOperand(1).getReg();
2101   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2102     ModSrc = SrcFAbs->getOperand(1).getReg();
2103   return ModSrc;
2104 }
2105 
2106 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2107                                          MachineRegisterInfo &MRI,
2108                                          MachineIRBuilder &B) const {
2109   B.setInstr(MI);
2110 
2111   const LLT S1 = LLT::scalar(1);
2112   const LLT S64 = LLT::scalar(64);
2113   Register Dst = MI.getOperand(0).getReg();
2114   Register OrigSrc = MI.getOperand(1).getReg();
2115   unsigned Flags = MI.getFlags();
2116   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2117          "this should not have been custom lowered");
2118 
2119   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2120   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2121   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2122   // V_FRACT bug is:
2123   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2124   //
2125   // Convert floor(x) to (x - fract(x))
2126 
2127   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2128     .addUse(OrigSrc)
2129     .setMIFlags(Flags);
2130 
2131   // Give source modifier matching some assistance before obscuring a foldable
2132   // pattern.
2133 
2134   // TODO: We can avoid the neg on the fract? The input sign to fract
2135   // shouldn't matter?
2136   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2137 
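  // 0x3fefffffffffffff is the largest double below 1.0 (1.0 - 2^-53), i.e. the
  // 0.99999999999999999 clamp value from the comment above.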
2138   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2139 
2140   Register Min = MRI.createGenericVirtualRegister(S64);
2141 
2142   // We don't need to concern ourselves with the snan handling difference, so
2143   // use the one which will directly select.
2144   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2145   if (MFI->getMode().IEEE)
2146     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2147   else
2148     B.buildFMinNum(Min, Fract, Const, Flags);
2149 
2150   Register CorrectedFract = Min;
2151   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2152     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2153     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2154   }
2155 
2156   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2157   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2158 
2159   MI.eraseFromParent();
2160   return true;
2161 }
2162 
2163 // Turn an illegal packed v2s16 build vector into bit operations.
2164 // TODO: This should probably be a bitcast action in LegalizerHelper.
2165 bool AMDGPULegalizerInfo::legalizeBuildVector(
2166   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2167   Register Dst = MI.getOperand(0).getReg();
2168   LLT DstTy = MRI.getType(Dst);
2169   const LLT S32 = LLT::scalar(32);
2170   const LLT V2S16 = LLT::vector(2, 16);
2171   (void)DstTy;
2172   (void)V2S16;
2173   assert(DstTy == V2S16);
2174 
2175   Register Src0 = MI.getOperand(1).getReg();
2176   Register Src1 = MI.getOperand(2).getReg();
2177   assert(MRI.getType(Src0) == LLT::scalar(16));
2178 
2179   B.setInstr(MI);
2180   auto Merge = B.buildMerge(S32, {Src0, Src1});
2181   B.buildBitcast(Dst, Merge);
2182 
2183   MI.eraseFromParent();
2184   return true;
2185 }
2186 
2187 // Return the use branch instruction, otherwise null if the usage is invalid.
2188 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2189                                        MachineRegisterInfo &MRI,
2190                                        MachineInstr *&Br) {
2191   Register CondDef = MI.getOperand(0).getReg();
2192   if (!MRI.hasOneNonDBGUse(CondDef))
2193     return nullptr;
2194 
2195   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2196   if (UseMI.getParent() != MI.getParent() ||
2197       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2198     return nullptr;
2199 
2200   // Make sure the cond br is followed by a G_BR
2201   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2202   if (Next != MI.getParent()->end()) {
2203     if (Next->getOpcode() != AMDGPU::G_BR)
2204       return nullptr;
2205     Br = &*Next;
2206   }
2207 
2208   return &UseMI;
2209 }
2210 
2211 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2212                                                 Register Reg, LLT Ty) const {
2213   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2214   if (LiveIn)
2215     return LiveIn;
2216 
2217   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2218   MRI.addLiveIn(Reg, NewReg);
2219   return NewReg;
2220 }
2221 
2222 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2223                                          const ArgDescriptor *Arg) const {
2224   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2225     return false; // TODO: Handle these
2226 
2227   assert(Arg->getRegister().isPhysical());
2228 
2229   MachineRegisterInfo &MRI = *B.getMRI();
2230 
2231   LLT Ty = MRI.getType(DstReg);
2232   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2233 
2234   if (Arg->isMasked()) {
2235     // TODO: Should we try to emit this once in the entry block?
2236     const LLT S32 = LLT::scalar(32);
2237     const unsigned Mask = Arg->getMask();
2238     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2239 
2240     Register AndMaskSrc = LiveIn;
2241 
2242     if (Shift != 0) {
2243       auto ShiftAmt = B.buildConstant(S32, Shift);
2244       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2245     }
2246 
2247     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2248   } else
2249     B.buildCopy(DstReg, LiveIn);
2250 
2251   // Insert the argument copy if it doesn't already exist.
2252   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2253   if (!MRI.getVRegDef(LiveIn)) {
2254     // FIXME: Should have scoped insert pt
2255     MachineBasicBlock &OrigInsBB = B.getMBB();
2256     auto OrigInsPt = B.getInsertPt();
2257 
2258     MachineBasicBlock &EntryMBB = B.getMF().front();
2259     EntryMBB.addLiveIn(Arg->getRegister());
2260     B.setInsertPt(EntryMBB, EntryMBB.begin());
2261     B.buildCopy(LiveIn, Arg->getRegister());
2262 
2263     B.setInsertPt(OrigInsBB, OrigInsPt);
2264   }
2265 
2266   return true;
2267 }
2268 
2269 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2270   MachineInstr &MI,
2271   MachineRegisterInfo &MRI,
2272   MachineIRBuilder &B,
2273   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2274   B.setInstr(MI);
2275 
2276   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2277 
2278   const ArgDescriptor *Arg;
2279   const TargetRegisterClass *RC;
2280   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2281   if (!Arg) {
2282     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2283     return false;
2284   }
2285 
2286   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2287     MI.eraseFromParent();
2288     return true;
2289   }
2290 
2291   return false;
2292 }
2293 
2294 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2295                                        MachineRegisterInfo &MRI,
2296                                        MachineIRBuilder &B) const {
2297   B.setInstr(MI);
2298   Register Dst = MI.getOperand(0).getReg();
2299   LLT DstTy = MRI.getType(Dst);
2300   LLT S16 = LLT::scalar(16);
2301   LLT S32 = LLT::scalar(32);
2302   LLT S64 = LLT::scalar(64);
2303 
2304   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2305     return true;
2306 
2307   if (DstTy == S16)
2308     return legalizeFDIV16(MI, MRI, B);
2309   if (DstTy == S32)
2310     return legalizeFDIV32(MI, MRI, B);
2311   if (DstTy == S64)
2312     return legalizeFDIV64(MI, MRI, B);
2313 
2314   return false;
2315 }
2316 
2317 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2318                                                  MachineRegisterInfo &MRI,
2319                                                  MachineIRBuilder &B) const {
2320   Register Res = MI.getOperand(0).getReg();
2321   Register LHS = MI.getOperand(1).getReg();
2322   Register RHS = MI.getOperand(2).getReg();
2323 
2324   uint16_t Flags = MI.getFlags();
2325 
2326   LLT ResTy = MRI.getType(Res);
2327   LLT S32 = LLT::scalar(32);
2328   LLT S64 = LLT::scalar(64);
2329 
2330   const MachineFunction &MF = B.getMF();
2331   bool Unsafe =
2332     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2333 
2334   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2335     return false;
2336 
2337   if (!Unsafe && ResTy == S32 &&
2338       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2339     return false;
2340 
2341   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2342     // 1 / x -> RCP(x)
2343     if (CLHS->isExactlyValue(1.0)) {
2344       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2345         .addUse(RHS)
2346         .setMIFlags(Flags);
2347 
2348       MI.eraseFromParent();
2349       return true;
2350     }
2351 
2352     // -1 / x -> RCP( FNEG(x) )
2353     if (CLHS->isExactlyValue(-1.0)) {
2354       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2355       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2356         .addUse(FNeg.getReg(0))
2357         .setMIFlags(Flags);
2358 
2359       MI.eraseFromParent();
2360       return true;
2361     }
2362   }
2363 
2364   // x / y -> x * (1.0 / y)
2365   if (Unsafe) {
2366     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2367       .addUse(RHS)
2368       .setMIFlags(Flags);
2369     B.buildFMul(Res, LHS, RCP, Flags);
2370 
2371     MI.eraseFromParent();
2372     return true;
2373   }
2374 
2375   return false;
2376 }
2377 
2378 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2379                                          MachineRegisterInfo &MRI,
2380                                          MachineIRBuilder &B) const {
2381   B.setInstr(MI);
2382   Register Res = MI.getOperand(0).getReg();
2383   Register LHS = MI.getOperand(1).getReg();
2384   Register RHS = MI.getOperand(2).getReg();
2385 
2386   uint16_t Flags = MI.getFlags();
2387 
2388   LLT S16 = LLT::scalar(16);
2389   LLT S32 = LLT::scalar(32);
2390 
2391   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2392   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2393 
2394   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2395     .addUse(RHSExt.getReg(0))
2396     .setMIFlags(Flags);
2397 
2398   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2399   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2400 
2401   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2402     .addUse(RDst.getReg(0))
2403     .addUse(RHS)
2404     .addUse(LHS)
2405     .setMIFlags(Flags);
2406 
2407   MI.eraseFromParent();
2408   return true;
2409 }
2410 
2411 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2412 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2413 static void toggleSPDenormMode(bool Enable,
2414                                MachineIRBuilder &B,
2415                                const GCNSubtarget &ST,
2416                                AMDGPU::SIModeRegisterDefaults Mode) {
2417   // Set SP denorm mode to this value.
2418   unsigned SPDenormMode =
2419     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2420 
2421   if (ST.hasDenormModeInst()) {
2422     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2423     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2424 
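    // The S_DENORM_MODE immediate is assumed to pack the FP32 denorm bits in
    // [1:0] and the FP64/FP16 bits in [3:2], hence the shift by 2 to keep the
    // DP default.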
2425     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2426     B.buildInstr(AMDGPU::S_DENORM_MODE)
2427       .addImm(NewDenormModeValue);
2428 
2429   } else {
2430     // Select FP32 bit field in mode register.
2431     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2432                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2433                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2434 
2435     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2436       .addImm(SPDenormMode)
2437       .addImm(SPDenormModeBitField);
2438   }
2439 }
2440 
2441 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2442                                          MachineRegisterInfo &MRI,
2443                                          MachineIRBuilder &B) const {
2444   B.setInstr(MI);
2445   Register Res = MI.getOperand(0).getReg();
2446   Register LHS = MI.getOperand(1).getReg();
2447   Register RHS = MI.getOperand(2).getReg();
2448   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2449   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2450 
2451   uint16_t Flags = MI.getFlags();
2452 
2453   LLT S32 = LLT::scalar(32);
2454   LLT S1 = LLT::scalar(1);
2455 
2456   auto One = B.buildFConstant(S32, 1.0f);
2457 
2458   auto DenominatorScaled =
2459     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2460       .addUse(RHS)
2461       .addUse(LHS)
2462       .addImm(1)
2463       .setMIFlags(Flags);
2464   auto NumeratorScaled =
2465     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2466       .addUse(LHS)
2467       .addUse(RHS)
2468       .addImm(0)
2469       .setMIFlags(Flags);
2470 
2471   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2472     .addUse(DenominatorScaled.getReg(0))
2473     .setMIFlags(Flags);
2474   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2475 
2476   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2477   // aren't modeled as reading it.
2478   if (!Mode.allFP32Denormals())
2479     toggleSPDenormMode(true, B, ST, Mode);
2480 
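  // Fma0/Fma1 perform one Newton-Raphson refinement of the reciprocal, Mul is
  // the initial quotient estimate, and Fma2-Fma4 compute a correction plus the
  // final residual that div_fmas/div_fixup consume below.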
2481   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2482   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2483   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2484   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2485   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2486   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2487 
2488   if (!Mode.allFP32Denormals())
2489     toggleSPDenormMode(false, B, ST, Mode);
2490 
2491   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2492     .addUse(Fma4.getReg(0))
2493     .addUse(Fma1.getReg(0))
2494     .addUse(Fma3.getReg(0))
2495     .addUse(NumeratorScaled.getReg(1))
2496     .setMIFlags(Flags);
2497 
2498   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2499     .addUse(Fmas.getReg(0))
2500     .addUse(RHS)
2501     .addUse(LHS)
2502     .setMIFlags(Flags);
2503 
2504   MI.eraseFromParent();
2505   return true;
2506 }
2507 
2508 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2509                                          MachineRegisterInfo &MRI,
2510                                          MachineIRBuilder &B) const {
2511   B.setInstr(MI);
2512   Register Res = MI.getOperand(0).getReg();
2513   Register LHS = MI.getOperand(1).getReg();
2514   Register RHS = MI.getOperand(2).getReg();
2515 
2516   uint16_t Flags = MI.getFlags();
2517 
2518   LLT S64 = LLT::scalar(64);
2519   LLT S1 = LLT::scalar(1);
2520 
2521   auto One = B.buildFConstant(S64, 1.0);
2522 
2523   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2524     .addUse(LHS)
2525     .addUse(RHS)
2526     .addImm(1)
2527     .setMIFlags(Flags);
2528 
2529   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2530 
2531   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2532     .addUse(DivScale0.getReg(0))
2533     .setMIFlags(Flags);
2534 
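  // Fma0/Fma1 and Fma2/Fma3 are two Newton-Raphson refinements of the
  // reciprocal, Mul is the quotient estimate, and Fma4 is its residual;
  // div_fmas/div_fixup combine them into the final result below.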
2535   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2536   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2537   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2538 
2539   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2540     .addUse(LHS)
2541     .addUse(RHS)
2542     .addImm(0)
2543     .setMIFlags(Flags);
2544 
2545   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2546   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2547   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2548 
2549   Register Scale;
2550   if (!ST.hasUsableDivScaleConditionOutput()) {
2551     // Workaround a hardware bug on SI where the condition output from div_scale
2552     // is not usable.
2553 
2554     LLT S32 = LLT::scalar(32);
2555 
2556     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2557     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2558     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2559     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2560 
2561     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2562                               Scale1Unmerge.getReg(1));
2563     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2564                               Scale0Unmerge.getReg(1));
2565     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2566   } else {
2567     Scale = DivScale1.getReg(1);
2568   }
2569 
2570   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2571     .addUse(Fma4.getReg(0))
2572     .addUse(Fma3.getReg(0))
2573     .addUse(Mul.getReg(0))
2574     .addUse(Scale)
2575     .setMIFlags(Flags);
2576 
2577   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2578     .addUse(Fmas.getReg(0))
2579     .addUse(RHS)
2580     .addUse(LHS)
2581     .setMIFlags(Flags);
2582 
2583   MI.eraseFromParent();
2584   return true;
2585 }
2586 
2587 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2588                                                  MachineRegisterInfo &MRI,
2589                                                  MachineIRBuilder &B) const {
2590   B.setInstr(MI);
2591   Register Res = MI.getOperand(0).getReg();
2592   Register LHS = MI.getOperand(2).getReg();
2593   Register RHS = MI.getOperand(3).getReg();
2594   uint16_t Flags = MI.getFlags();
2595 
2596   LLT S32 = LLT::scalar(32);
2597   LLT S1 = LLT::scalar(1);
2598 
2599   auto Abs = B.buildFAbs(S32, RHS, Flags);
2600   const APFloat C0Val(1.0f);
2601 
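  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32: when |rhs| exceeds 2^96 the
  // denominator is pre-scaled by 2^-32 so its reciprocal stays representable,
  // and the quotient is rescaled by the same factor at the end.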
2602   auto C0 = B.buildConstant(S32, 0x6f800000);
2603   auto C1 = B.buildConstant(S32, 0x2f800000);
2604   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2605 
2606   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2607   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2608 
2609   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2610 
2611   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2612     .addUse(Mul0.getReg(0))
2613     .setMIFlags(Flags);
2614 
2615   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2616 
2617   B.buildFMul(Res, Sel, Mul1, Flags);
2618 
2619   MI.eraseFromParent();
2620   return true;
2621 }
2622 
2623 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2624                                                  MachineRegisterInfo &MRI,
2625                                                  MachineIRBuilder &B) const {
2626   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2627   if (!MFI->isEntryFunction()) {
2628     return legalizePreloadedArgIntrin(MI, MRI, B,
2629                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2630   }
2631 
2632   B.setInstr(MI);
2633 
2634   uint64_t Offset =
2635     ST.getTargetLowering()->getImplicitParameterOffset(
2636       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2637   Register DstReg = MI.getOperand(0).getReg();
2638   LLT DstTy = MRI.getType(DstReg);
2639   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2640 
2641   const ArgDescriptor *Arg;
2642   const TargetRegisterClass *RC;
2643   std::tie(Arg, RC)
2644     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2645   if (!Arg)
2646     return false;
2647 
2648   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2649   if (!loadInputValue(KernargPtrReg, B, Arg))
2650     return false;
2651 
2652   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2653   MI.eraseFromParent();
2654   return true;
2655 }
2656 
2657 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2658                                               MachineRegisterInfo &MRI,
2659                                               MachineIRBuilder &B,
2660                                               unsigned AddrSpace) const {
2661   B.setInstr(MI);
2662   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2663   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2664   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2665   MI.eraseFromParent();
2666   return true;
2667 }
2668 
2669 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2670 // offset (the offset that is included in bounds checking and swizzling, to be
2671 // split between the instruction's voffset and immoffset fields) and soffset
2672 // (the offset that is excluded from bounds checking and swizzling, to go in
2673 // the instruction's soffset field).  This function takes the first kind of
2674 // offset and figures out how to split it between voffset and immoffset.
2675 std::tuple<Register, unsigned, unsigned>
2676 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2677                                         Register OrigOffset) const {
2678   const unsigned MaxImm = 4095;
2679   Register BaseReg;
2680   unsigned TotalConstOffset;
2681   MachineInstr *OffsetDef;
2682   const LLT S32 = LLT::scalar(32);
2683 
2684   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2685     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2686 
2687   unsigned ImmOffset = TotalConstOffset;
2688 
2689   // If the immediate value is too big for the immoffset field, keep only its
2690   // low 12 bits (value & 4095) there and move the remainder (value & -4096),
2691   // a multiple of 4096, into the value copied/added for the voffset field, so
2692   // it stands a better chance of being CSEd with another similar load/store.
2693   // However, do not do that rounding down to a multiple of 4096 if that is a
2694   // negative number, as it appears to be illegal to have a negative offset
2695   // in the vgpr, even if adding the immediate offset makes it positive.
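  // For example, a constant offset of 5000 splits into Overflow =
  // 5000 & ~4095 = 4096 (added to the voffset below) and ImmOffset =
  // 5000 - 4096 = 904, which fits in the 12-bit immediate field.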
2696   unsigned Overflow = ImmOffset & ~MaxImm;
2697   ImmOffset -= Overflow;
2698   if ((int32_t)Overflow < 0) {
2699     Overflow += ImmOffset;
2700     ImmOffset = 0;
2701   }
2702 
2703   if (Overflow != 0) {
2704     if (!BaseReg) {
2705       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2706     } else {
2707       auto OverflowVal = B.buildConstant(S32, Overflow);
2708       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2709     }
2710   }
2711 
2712   if (!BaseReg)
2713     BaseReg = B.buildConstant(S32, 0).getReg(0);
2714 
2715   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2716 }
2717 
2718 /// Handle register layout difference for f16 images for some subtargets.
2719 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2720                                              MachineRegisterInfo &MRI,
2721                                              Register Reg) const {
2722   if (!ST.hasUnpackedD16VMem())
2723     return Reg;
2724 
2725   const LLT S16 = LLT::scalar(16);
2726   const LLT S32 = LLT::scalar(32);
2727   LLT StoreVT = MRI.getType(Reg);
2728   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2729 
2730   auto Unmerge = B.buildUnmerge(S16, Reg);
2731 
2732   SmallVector<Register, 4> WideRegs;
2733   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2734     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2735 
2736   int NumElts = StoreVT.getNumElements();
2737 
2738   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2739 }
2740 
2741 Register AMDGPULegalizerInfo::fixStoreSourceType(
2742   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2743   MachineRegisterInfo *MRI = B.getMRI();
2744   LLT Ty = MRI->getType(VData);
2745 
2746   const LLT S16 = LLT::scalar(16);
2747 
2748   // Fix up illegal register types for 8-bit and 16-bit stores.
2749   if (Ty == LLT::scalar(8) || Ty == S16) {
2750     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2751     return AnyExt;
2752   }
2753 
2754   if (Ty.isVector()) {
2755     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2756       if (IsFormat)
2757         return handleD16VData(B, *MRI, VData);
2758     }
2759   }
2760 
2761   return VData;
2762 }
2763 
2764 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2765                                               MachineRegisterInfo &MRI,
2766                                               MachineIRBuilder &B,
2767                                               bool IsTyped,
2768                                               bool IsFormat) const {
2769   B.setInstr(MI);
2770 
2771   Register VData = MI.getOperand(1).getReg();
2772   LLT Ty = MRI.getType(VData);
2773   LLT EltTy = Ty.getScalarType();
2774   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2775   const LLT S32 = LLT::scalar(32);
2776 
2777   VData = fixStoreSourceType(B, VData, IsFormat);
2778   Register RSrc = MI.getOperand(2).getReg();
2779 
2780   MachineMemOperand *MMO = *MI.memoperands_begin();
2781   const int MemSize = MMO->getSize();
2782 
2783   unsigned ImmOffset;
2784   unsigned TotalOffset;
2785 
2786   // The typed intrinsics add an immediate after the registers.
2787   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2788 
2789   // The struct intrinsic variants add one additional operand over raw.
2790   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2791   Register VIndex;
2792   int OpOffset = 0;
2793   if (HasVIndex) {
2794     VIndex = MI.getOperand(3).getReg();
2795     OpOffset = 1;
2796   }
2797 
2798   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2799   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2800 
2801   unsigned Format = 0;
2802   if (IsTyped) {
2803     Format = MI.getOperand(5 + OpOffset).getImm();
2804     ++OpOffset;
2805   }
2806 
2807   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2808 
2809   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2810   if (TotalOffset != 0)
2811     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2812 
2813   unsigned Opc;
2814   if (IsTyped) {
2815     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2816                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2817   } else if (IsFormat) {
2818     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2819                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2820   } else {
2821     switch (MemSize) {
2822     case 1:
2823       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2824       break;
2825     case 2:
2826       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2827       break;
2828     default:
2829       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2830       break;
2831     }
2832   }
2833 
2834   if (!VIndex)
2835     VIndex = B.buildConstant(S32, 0).getReg(0);
2836 
2837   auto MIB = B.buildInstr(Opc)
2838     .addUse(VData)              // vdata
2839     .addUse(RSrc)               // rsrc
2840     .addUse(VIndex)             // vindex
2841     .addUse(VOffset)            // voffset
2842     .addUse(SOffset)            // soffset
2843     .addImm(ImmOffset);         // offset(imm)
2844 
2845   if (IsTyped)
2846     MIB.addImm(Format);
2847 
2848   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2849      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2850      .addMemOperand(MMO);
2851 
2852   MI.eraseFromParent();
2853   return true;
2854 }
2855 
2856 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2857                                              MachineRegisterInfo &MRI,
2858                                              MachineIRBuilder &B,
2859                                              bool IsFormat,
2860                                              bool IsTyped) const {
2861   B.setInstr(MI);
2862 
2863   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2864   MachineMemOperand *MMO = *MI.memoperands_begin();
2865   const int MemSize = MMO->getSize();
2866   const LLT S32 = LLT::scalar(32);
2867 
2868   Register Dst = MI.getOperand(0).getReg();
2869   Register RSrc = MI.getOperand(2).getReg();
2870 
2871   // The typed intrinsics add an immediate after the registers.
2872   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2873 
2874   // The struct intrinsic variants add one additional operand over raw.
2875   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2876   Register VIndex;
2877   int OpOffset = 0;
2878   if (HasVIndex) {
2879     VIndex = MI.getOperand(3).getReg();
2880     OpOffset = 1;
2881   }
2882 
2883   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2884   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2885 
2886   unsigned Format = 0;
2887   if (IsTyped) {
2888     Format = MI.getOperand(5 + OpOffset).getImm();
2889     ++OpOffset;
2890   }
2891 
2892   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2893   unsigned ImmOffset;
2894   unsigned TotalOffset;
2895 
2896   LLT Ty = MRI.getType(Dst);
2897   LLT EltTy = Ty.getScalarType();
2898   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2899   const bool Unpacked = ST.hasUnpackedD16VMem();
2900 
2901   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2902   if (TotalOffset != 0)
2903     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2904 
2905   unsigned Opc;
2906 
2907   if (IsTyped) {
2908     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2909                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2910   } else if (IsFormat) {
2911     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2912                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2913   } else {
2914     switch (MemSize) {
2915     case 1:
2916       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2917       break;
2918     case 2:
2919       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2920       break;
2921     default:
2922       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2923       break;
2924     }
2925   }
2926 
2927   Register LoadDstReg;
2928 
2929   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2930   LLT UnpackedTy = Ty.changeElementSize(32);
2931 
2932   if (IsExtLoad)
2933     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2934   else if (Unpacked && IsD16 && Ty.isVector())
2935     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2936   else
2937     LoadDstReg = Dst;
2938 
2939   if (!VIndex)
2940     VIndex = B.buildConstant(S32, 0).getReg(0);
2941 
2942   auto MIB = B.buildInstr(Opc)
2943     .addDef(LoadDstReg)         // vdata
2944     .addUse(RSrc)               // rsrc
2945     .addUse(VIndex)             // vindex
2946     .addUse(VOffset)            // voffset
2947     .addUse(SOffset)            // soffset
2948     .addImm(ImmOffset);         // offset(imm)
2949 
2950   if (IsTyped)
2951     MIB.addImm(Format);
2952 
2953   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2954      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2955      .addMemOperand(MMO);
2956 
2957   if (LoadDstReg != Dst) {
2958     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2959 
    // The result register was widened for the extending load; truncate it
    // back to the original type.
2961     if (IsExtLoad)
2962       B.buildTrunc(Dst, LoadDstReg);
2963     else {
2964       // Repack to original 16-bit vector result
2965       // FIXME: G_TRUNC should work, but legalization currently fails
2966       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2967       SmallVector<Register, 4> Repack;
2968       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2969         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2970       B.buildMerge(Dst, Repack);
2971     }
2972   }
2973 
2974   MI.eraseFromParent();
2975   return true;
2976 }
2977 
2978 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2979                                                MachineIRBuilder &B,
2980                                                bool IsInc) const {
2981   B.setInstr(MI);
2982   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2983                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2984   B.buildInstr(Opc)
2985     .addDef(MI.getOperand(0).getReg())
2986     .addUse(MI.getOperand(2).getReg())
2987     .addUse(MI.getOperand(3).getReg())
2988     .cloneMemRefs(MI);
2989   MI.eraseFromParent();
2990   return true;
2991 }
2992 
2993 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2994   switch (IntrID) {
2995   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2996   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2997     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2998   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2999   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3000     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3001   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3002   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3003     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3004   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3005   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3006     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3007   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3008   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3009     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3010   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3011   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3012     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3013   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3014   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3015     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3016   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3017   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3018     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3019   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3020   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3021     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3022   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3023   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3024     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3025   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3026   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3027     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3028   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3029   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3030     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3031   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3032   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3033     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3034   default:
3035     llvm_unreachable("unhandled atomic opcode");
3036   }
3037 }
3038 
3039 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3040                                                MachineIRBuilder &B,
3041                                                Intrinsic::ID IID) const {
3042   B.setInstr(MI);
3043 
3044   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3045                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3046 
3047   Register Dst = MI.getOperand(0).getReg();
3048   Register VData = MI.getOperand(2).getReg();
3049 
3050   Register CmpVal;
3051   int OpOffset = 0;
3052 
3053   if (IsCmpSwap) {
3054     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3055     ++OpOffset;
3056   }
3057 
3058   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3059   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
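  // For reference, the raw atomic operands are laid out as: dst, intrinsic
  // ID, vdata, (cmp for cmpswap,) rsrc, voffset, soffset, aux.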
3060 
3061   // The struct intrinsic variants add one additional operand over raw.
3062   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3063   Register VIndex;
3064   if (HasVIndex) {
3065     VIndex = MI.getOperand(4 + OpOffset).getReg();
3066     ++OpOffset;
3067   }
3068 
3069   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3070   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3071   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3072 
3073   MachineMemOperand *MMO = *MI.memoperands_begin();
3074 
3075   unsigned ImmOffset;
3076   unsigned TotalOffset;
3077   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3078   if (TotalOffset != 0)
3079     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3080 
3081   if (!VIndex)
3082     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3083 
3084   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3085     .addDef(Dst)
3086     .addUse(VData); // vdata
3087 
3088   if (IsCmpSwap)
    MIB.addUse(CmpVal);
3090 
3091   MIB.addUse(RSrc)               // rsrc
3092      .addUse(VIndex)             // vindex
3093      .addUse(VOffset)            // voffset
3094      .addUse(SOffset)            // soffset
3095      .addImm(ImmOffset)          // offset(imm)
3096      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3097      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3098      .addMemOperand(MMO);
3099 
3100   MI.eraseFromParent();
3101   return true;
3102 }
3103 
3104 // Produce a vector of s16 elements from s32 pieces.
3105 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3106                              ArrayRef<Register> UnmergeParts) {
3107   const LLT S16 = LLT::scalar(16);
3108 
3109   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3110   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3111     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3112 
3113   B.buildBuildVector(DstReg, RemergeParts);
3114 }
3115 
3116 /// Convert a set of s32 registers to a result vector with s16 elements.
3117 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3118                                ArrayRef<Register> UnmergeParts) {
3119   MachineRegisterInfo &MRI = *B.getMRI();
3120   const LLT V2S16 = LLT::vector(2, 16);
3121   LLT TargetTy = MRI.getType(DstReg);
3122   int NumElts = UnmergeParts.size();
3123 
3124   if (NumElts == 1) {
3125     assert(TargetTy == V2S16);
3126     B.buildBitcast(DstReg, UnmergeParts[0]);
3127     return;
3128   }
3129 
3130   SmallVector<Register, 4> RemergeParts(NumElts);
3131   for (int I = 0; I != NumElts; ++I)
3132     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3133 
3134   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3135     B.buildConcatVectors(DstReg, RemergeParts);
3136     return;
3137   }
3138 
3139   const LLT V3S16 = LLT::vector(3, 16);
3140   const LLT V6S16 = LLT::vector(6, 16);
3141 
3142   // Widen to v6s16 and unpack v3 parts.
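  // For example, a <3 x s16> result held in two s32 pieces becomes two
  // <2 x s16> bitcasts plus an undef <2 x s16>, concatenated to <6 x s16>,
  // from which the low <3 x s16> is unmerged into the result.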
3143   assert(TargetTy == V3S16);
3144 
3145   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3146   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3147   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3148 }
3149 
// FIXME: A plain vector trunc should be sufficient, but legalization is
// currently broken.
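// For example, an unpacked <2 x s16> load result arrives as <2 x s32>; it is
// unmerged into two s32 pieces, each piece truncated to s16, and the result
// rebuilt as <2 x s16>.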
3152 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3153                                   Register WideDstReg) {
3154   const LLT S32 = LLT::scalar(32);
3155   const LLT S16 = LLT::scalar(16);
3156 
3157   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3158 
3159   int NumOps = Unmerge->getNumOperands() - 1;
3160   SmallVector<Register, 4> RemergeParts(NumOps);
3161   for (int I = 0; I != NumOps; ++I)
3162     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3163 
3164   B.buildBuildVector(DstReg, RemergeParts);
3165 }
3166 
3167 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3168     MachineInstr &MI, MachineIRBuilder &B,
3169     GISelChangeObserver &Observer,
3170     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3171   bool IsTFE = MI.getNumExplicitDefs() == 2;
3172 
  // We only need to process the operands of d16 image operations on
  // subtargets that use the unpacked register layout, or when the TFE result
  // has to be repacked.
3175 
3176   // TODO: Need to handle a16 images too
3177   // TODO: Do we need to guard against already legalized intrinsics?
3178   if (!IsTFE && !ST.hasUnpackedD16VMem())
3179     return true;
3180 
3181   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3182     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3183 
3184   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3185     return true;
3186 
3187   B.setInstr(MI);
3188 
3189   MachineRegisterInfo *MRI = B.getMRI();
3190   const LLT S32 = LLT::scalar(32);
3191   const LLT S16 = LLT::scalar(16);
3192 
3193   if (BaseOpcode->Store) { // No TFE for stores?
3194     Register VData = MI.getOperand(1).getReg();
3195     LLT Ty = MRI->getType(VData);
3196     if (!Ty.isVector() || Ty.getElementType() != S16)
3197       return true;
3198 
3199     B.setInstr(MI);
3200 
3201     Observer.changingInstr(MI);
3202     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3203     Observer.changedInstr(MI);
3204     return true;
3205   }
3206 
3207   Register DstReg = MI.getOperand(0).getReg();
3208   LLT Ty = MRI->getType(DstReg);
3209   const LLT EltTy = Ty.getScalarType();
3210   const bool IsD16 = Ty.getScalarType() == S16;
3211   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3212 
3213   if (IsTFE) {
3214     // In the IR, TFE is supposed to be used with a 2 element struct return
    // type. The instruction really returns these two values in one contiguous
3216     // register, with one additional dword beyond the loaded data. Rewrite the
3217     // return type to use a single register result.
3218     Register Dst1Reg = MI.getOperand(1).getReg();
3219     if (MRI->getType(Dst1Reg) != S32)
3220       return false;
3221 
3222     // TODO: Make sure the TFE operand bit is set.
3223 
3224     // The raw dword aligned data component of the load. The only legal cases
3225     // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3227     LLT RoundedTy;
3228     LLT TFETy;
3229 
3230     if (IsD16 && ST.hasUnpackedD16VMem()) {
3231       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3232       TFETy = LLT::vector(NumElts + 1, 32);
3233     } else {
3234       unsigned EltSize = Ty.getScalarSizeInBits();
3235       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3236       unsigned RoundedSize = 32 * RoundedElts;
3237       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3238       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3239     }
3240 
3241     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3242     Observer.changingInstr(MI);
3243 
3244     MI.getOperand(0).setReg(TFEReg);
3245     MI.RemoveOperand(1);
3246 
3247     Observer.changedInstr(MI);
3248 
3249     // Insert after the instruction.
3250     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3251 
3252     // Now figure out how to copy the new result register back into the old
3253     // result.
3254 
3255     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3256     int NumDataElts = TFETy.getNumElements() - 1;
3257 
3258     if (!Ty.isVector()) {
3259       // Simplest case is a trivial unmerge (plus a truncate for d16).
3260       UnmergeResults[0] = Ty == S32 ?
3261         DstReg : MRI->createGenericVirtualRegister(S32);
3262 
3263       B.buildUnmerge(UnmergeResults, TFEReg);
3264       if (Ty != S32)
3265         B.buildTrunc(DstReg, UnmergeResults[0]);
3266       return true;
3267     }
3268 
3269     // We have to repack into a new vector of some kind.
3270     for (int I = 0; I != NumDataElts; ++I)
3271       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3272     B.buildUnmerge(UnmergeResults, TFEReg);
3273 
3274     // Drop the final TFE element.
3275     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3276 
3277     if (EltTy == S32)
3278       B.buildBuildVector(DstReg, DataPart);
3279     else if (ST.hasUnpackedD16VMem())
3280       truncToS16Vector(B, DstReg, DataPart);
3281     else
3282       bitcastToS16Vector(B, DstReg, DataPart);
3283 
3284     return true;
3285   }
3286 
3287   // Must be an image load.
3288   if (!Ty.isVector() || Ty.getElementType() != S16)
3289     return true;
3290 
3291   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3292 
3293   LLT WidenedTy = Ty.changeElementType(S32);
3294   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3295 
3296   Observer.changingInstr(MI);
3297   MI.getOperand(0).setReg(WideDstReg);
3298   Observer.changedInstr(MI);
3299 
3300   repackUnpackedD16Load(B, DstReg, WideDstReg);
3301   return true;
3302 }
3303 
3304 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3305   MachineInstr &MI, MachineIRBuilder &B,
3306   GISelChangeObserver &Observer) const {
3307   Register Dst = MI.getOperand(0).getReg();
3308   LLT Ty = B.getMRI()->getType(Dst);
3309   unsigned Size = Ty.getSizeInBits();
3310   MachineFunction &MF = B.getMF();
3311 
3312   Observer.changingInstr(MI);
3313 
3314   // FIXME: We don't really need this intermediate instruction. The intrinsic
3315   // should be fixed to have a memory operand. Since it's readnone, we're not
3316   // allowed to add one.
3317   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3318   MI.RemoveOperand(1); // Remove intrinsic ID
3319 
3320   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3321   // TODO: Should this use datalayout alignment?
3322   const unsigned MemSize = (Size + 7) / 8;
3323   const unsigned MemAlign = 4;
3324   MachineMemOperand *MMO = MF.getMachineMemOperand(
3325     MachinePointerInfo(),
3326     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3327     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3328   MI.addMemOperand(MF, MMO);
3329 
3330   // There are no 96-bit result scalar loads, but widening to 128-bit should
3331   // always be legal. We may need to restore this to a 96-bit result if it turns
3332   // out this needs to be converted to a vector load during RegBankSelect.
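  // For example, a 96-bit <3 x s32> result is widened to <4 x s32> here.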
3333   if (!isPowerOf2_32(Size)) {
3334     LegalizerHelper Helper(MF, *this, Observer, B);
3335     B.setInstr(MI);
3336 
3337     if (Ty.isVector())
3338       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3339     else
3340       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3341   }
3342 
3343   Observer.changedInstr(MI);
3344   return true;
3345 }
3346 
3347 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3348                                             MachineIRBuilder &B,
3349                                             GISelChangeObserver &Observer) const {
3350   MachineRegisterInfo &MRI = *B.getMRI();
3351 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
3353   auto IntrID = MI.getIntrinsicID();
3354   switch (IntrID) {
3355   case Intrinsic::amdgcn_if:
3356   case Intrinsic::amdgcn_else: {
3357     MachineInstr *Br = nullptr;
3358     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3359       const SIRegisterInfo *TRI
3360         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3361 
3362       B.setInstr(*BrCond);
3363       Register Def = MI.getOperand(1).getReg();
3364       Register Use = MI.getOperand(3).getReg();
3365 
3366       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
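      // If verifyCFIntrinsic also found a trailing unconditional branch, its
      // target takes precedence.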
3367       if (Br)
3368         BrTarget = Br->getOperand(0).getMBB();
3369 
3370       if (IntrID == Intrinsic::amdgcn_if) {
3371         B.buildInstr(AMDGPU::SI_IF)
3372           .addDef(Def)
3373           .addUse(Use)
3374           .addMBB(BrTarget);
3375       } else {
3376         B.buildInstr(AMDGPU::SI_ELSE)
3377           .addDef(Def)
3378           .addUse(Use)
3379           .addMBB(BrTarget)
3380           .addImm(0);
3381       }
3382 
3383       if (Br)
3384         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3385 
3386       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3387       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3388       MI.eraseFromParent();
3389       BrCond->eraseFromParent();
3390       return true;
3391     }
3392 
3393     return false;
3394   }
3395   case Intrinsic::amdgcn_loop: {
3396     MachineInstr *Br = nullptr;
3397     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3398       const SIRegisterInfo *TRI
3399         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3400 
3401       B.setInstr(*BrCond);
3402 
3403       // FIXME: Need to adjust branch targets based on unconditional branch.
3404       Register Reg = MI.getOperand(2).getReg();
3405       B.buildInstr(AMDGPU::SI_LOOP)
3406         .addUse(Reg)
3407         .addMBB(BrCond->getOperand(1).getMBB());
3408       MI.eraseFromParent();
3409       BrCond->eraseFromParent();
3410       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3411       return true;
3412     }
3413 
3414     return false;
3415   }
3416   case Intrinsic::amdgcn_kernarg_segment_ptr:
3417     return legalizePreloadedArgIntrin(
3418       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3419   case Intrinsic::amdgcn_implicitarg_ptr:
3420     return legalizeImplicitArgPtr(MI, MRI, B);
3421   case Intrinsic::amdgcn_workitem_id_x:
3422     return legalizePreloadedArgIntrin(MI, MRI, B,
3423                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3424   case Intrinsic::amdgcn_workitem_id_y:
3425     return legalizePreloadedArgIntrin(MI, MRI, B,
3426                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3427   case Intrinsic::amdgcn_workitem_id_z:
3428     return legalizePreloadedArgIntrin(MI, MRI, B,
3429                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3430   case Intrinsic::amdgcn_workgroup_id_x:
3431     return legalizePreloadedArgIntrin(MI, MRI, B,
3432                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3433   case Intrinsic::amdgcn_workgroup_id_y:
3434     return legalizePreloadedArgIntrin(MI, MRI, B,
3435                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3436   case Intrinsic::amdgcn_workgroup_id_z:
3437     return legalizePreloadedArgIntrin(MI, MRI, B,
3438                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3439   case Intrinsic::amdgcn_dispatch_ptr:
3440     return legalizePreloadedArgIntrin(MI, MRI, B,
3441                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3442   case Intrinsic::amdgcn_queue_ptr:
3443     return legalizePreloadedArgIntrin(MI, MRI, B,
3444                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3445   case Intrinsic::amdgcn_implicit_buffer_ptr:
3446     return legalizePreloadedArgIntrin(
3447       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3448   case Intrinsic::amdgcn_dispatch_id:
3449     return legalizePreloadedArgIntrin(MI, MRI, B,
3450                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3451   case Intrinsic::amdgcn_fdiv_fast:
3452     return legalizeFDIVFastIntrin(MI, MRI, B);
3453   case Intrinsic::amdgcn_is_shared:
3454     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3455   case Intrinsic::amdgcn_is_private:
3456     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3457   case Intrinsic::amdgcn_wavefrontsize: {
3458     B.setInstr(MI);
3459     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3460     MI.eraseFromParent();
3461     return true;
3462   }
3463   case Intrinsic::amdgcn_s_buffer_load:
3464     return legalizeSBufferLoad(MI, B, Observer);
3465   case Intrinsic::amdgcn_raw_buffer_store:
3466   case Intrinsic::amdgcn_struct_buffer_store:
3467     return legalizeBufferStore(MI, MRI, B, false, false);
3468   case Intrinsic::amdgcn_raw_buffer_store_format:
3469   case Intrinsic::amdgcn_struct_buffer_store_format:
3470     return legalizeBufferStore(MI, MRI, B, false, true);
3471   case Intrinsic::amdgcn_raw_tbuffer_store:
3472   case Intrinsic::amdgcn_struct_tbuffer_store:
3473     return legalizeBufferStore(MI, MRI, B, true, true);
3474   case Intrinsic::amdgcn_raw_buffer_load:
3475   case Intrinsic::amdgcn_struct_buffer_load:
3476     return legalizeBufferLoad(MI, MRI, B, false, false);
3477   case Intrinsic::amdgcn_raw_buffer_load_format:
3478   case Intrinsic::amdgcn_struct_buffer_load_format:
3479     return legalizeBufferLoad(MI, MRI, B, true, false);
3480   case Intrinsic::amdgcn_raw_tbuffer_load:
3481   case Intrinsic::amdgcn_struct_tbuffer_load:
3482     return legalizeBufferLoad(MI, MRI, B, true, true);
3483   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3484   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3485   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3486   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3487   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3488   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3489   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3490   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3491   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3492   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3493   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3494   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3495   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3496   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3497   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3498   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3499   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3500   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3501   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3502   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3503   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3504   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3505   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3506   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3507   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3508   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3509     return legalizeBufferAtomic(MI, B, IntrID);
3510   case Intrinsic::amdgcn_atomic_inc:
3511     return legalizeAtomicIncDec(MI, B, true);
3512   case Intrinsic::amdgcn_atomic_dec:
3513     return legalizeAtomicIncDec(MI, B, false);
3514   default: {
3515     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3516             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3517       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3518     return true;
3519   }
3520   }
3521 
3522   return true;
3523 }
3524