1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 static LegalityPredicate isMultiple32(unsigned TypeIdx,
46                                       unsigned MaxSize = 1024) {
47   return [=](const LegalityQuery &Query) {
48     const LLT Ty = Query.Types[TypeIdx];
49     const LLT EltTy = Ty.getScalarType();
50     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
51   };
52 }
53 
54 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
55   return [=](const LegalityQuery &Query) {
56     return Query.Types[TypeIdx].getSizeInBits() == Size;
57   };
58 }
59 
60 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     return Ty.isVector() &&
64            Ty.getNumElements() % 2 != 0 &&
65            Ty.getElementType().getSizeInBits() < 32 &&
66            Ty.getSizeInBits() % 32 != 0;
67   };
68 }
69 
70 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
71   return [=](const LegalityQuery &Query) {
72     const LLT Ty = Query.Types[TypeIdx];
73     const LLT EltTy = Ty.getScalarType();
74     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
75   };
76 }
77 
78 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT Ty = Query.Types[TypeIdx];
81     const LLT EltTy = Ty.getElementType();
82     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
83   };
84 }
85 
86 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getElementType();
90     unsigned Size = Ty.getSizeInBits();
91     unsigned Pieces = (Size + 63) / 64;
92     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
93     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
94   };
95 }
96 
97 // Increase the number of vector elements to reach the next multiple of 32-bit
98 // type.
99 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
100   return [=](const LegalityQuery &Query) {
101     const LLT Ty = Query.Types[TypeIdx];
102 
103     const LLT EltTy = Ty.getElementType();
104     const int Size = Ty.getSizeInBits();
105     const int EltSize = EltTy.getSizeInBits();
106     const int NextMul32 = (Size + 31) / 32;
107 
108     assert(EltSize < 32);
109 
110     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
111     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
112   };
113 }
114 
115 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
116   return [=](const LegalityQuery &Query) {
117     const LLT QueryTy = Query.Types[TypeIdx];
118     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
119   };
120 }
121 
122 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
123   return [=](const LegalityQuery &Query) {
124     const LLT QueryTy = Query.Types[TypeIdx];
125     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
126   };
127 }
128 
129 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
133   };
134 }
135 
136 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
137 // v2s16.
138 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
139   return [=](const LegalityQuery &Query) {
140     const LLT Ty = Query.Types[TypeIdx];
141     if (Ty.isVector()) {
142       const int EltSize = Ty.getElementType().getSizeInBits();
143       return EltSize == 32 || EltSize == 64 ||
144             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
145              EltSize == 128 || EltSize == 256;
146     }
147 
148     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
149   };
150 }
151 
152 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
153   return [=](const LegalityQuery &Query) {
154     const LLT QueryTy = Query.Types[TypeIdx];
155     return QueryTy.isVector() && QueryTy.getElementType() == Type;
156   };
157 }
158 
159 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
160   return [=](const LegalityQuery &Query) {
161     const LLT Ty = Query.Types[TypeIdx];
162     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
163            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
164   };
165 }
166 
167 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
168                                          const GCNTargetMachine &TM)
169   :  ST(ST_) {
170   using namespace TargetOpcode;
171 
172   auto GetAddrSpacePtr = [&TM](unsigned AS) {
173     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
174   };
175 
176   const LLT S1 = LLT::scalar(1);
177   const LLT S16 = LLT::scalar(16);
178   const LLT S32 = LLT::scalar(32);
179   const LLT S64 = LLT::scalar(64);
180   const LLT S96 = LLT::scalar(96);
181   const LLT S128 = LLT::scalar(128);
182   const LLT S256 = LLT::scalar(256);
183   const LLT S1024 = LLT::scalar(1024);
184 
185   const LLT V2S16 = LLT::vector(2, 16);
186   const LLT V4S16 = LLT::vector(4, 16);
187 
188   const LLT V2S32 = LLT::vector(2, 32);
189   const LLT V3S32 = LLT::vector(3, 32);
190   const LLT V4S32 = LLT::vector(4, 32);
191   const LLT V5S32 = LLT::vector(5, 32);
192   const LLT V6S32 = LLT::vector(6, 32);
193   const LLT V7S32 = LLT::vector(7, 32);
194   const LLT V8S32 = LLT::vector(8, 32);
195   const LLT V9S32 = LLT::vector(9, 32);
196   const LLT V10S32 = LLT::vector(10, 32);
197   const LLT V11S32 = LLT::vector(11, 32);
198   const LLT V12S32 = LLT::vector(12, 32);
199   const LLT V13S32 = LLT::vector(13, 32);
200   const LLT V14S32 = LLT::vector(14, 32);
201   const LLT V15S32 = LLT::vector(15, 32);
202   const LLT V16S32 = LLT::vector(16, 32);
203   const LLT V32S32 = LLT::vector(32, 32);
204 
205   const LLT V2S64 = LLT::vector(2, 64);
206   const LLT V3S64 = LLT::vector(3, 64);
207   const LLT V4S64 = LLT::vector(4, 64);
208   const LLT V5S64 = LLT::vector(5, 64);
209   const LLT V6S64 = LLT::vector(6, 64);
210   const LLT V7S64 = LLT::vector(7, 64);
211   const LLT V8S64 = LLT::vector(8, 64);
212   const LLT V16S64 = LLT::vector(16, 64);
213 
214   std::initializer_list<LLT> AllS32Vectors =
215     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
216      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
217   std::initializer_list<LLT> AllS64Vectors =
218     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
219 
220   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
221   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
222   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
223   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
224   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
225   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
226   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
227 
228   const LLT CodePtr = FlatPtr;
229 
230   const std::initializer_list<LLT> AddrSpaces64 = {
231     GlobalPtr, ConstantPtr, FlatPtr
232   };
233 
234   const std::initializer_list<LLT> AddrSpaces32 = {
235     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
236   };
237 
238   const std::initializer_list<LLT> FPTypesBase = {
239     S32, S64
240   };
241 
242   const std::initializer_list<LLT> FPTypes16 = {
243     S32, S64, S16
244   };
245 
246   const std::initializer_list<LLT> FPTypesPK16 = {
247     S32, S64, S16, V2S16
248   };
249 
250   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
251 
252   setAction({G_BRCOND, S1}, Legal); // VCC branches
253   setAction({G_BRCOND, S32}, Legal); // SCC branches
254 
255   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
256   // elements for v3s16
257   getActionDefinitionsBuilder(G_PHI)
258     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
259     .legalFor(AllS32Vectors)
260     .legalFor(AllS64Vectors)
261     .legalFor(AddrSpaces64)
262     .legalFor(AddrSpaces32)
263     .clampScalar(0, S32, S256)
264     .widenScalarToNextPow2(0, 32)
265     .clampMaxNumElements(0, S32, 16)
266     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
267     .legalIf(isPointer(0));
268 
269   if (ST.has16BitInsts()) {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32, S16})
272       .clampScalar(0, S16, S32)
273       .scalarize(0);
274   } else {
275     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
276       .legalFor({S32})
277       .clampScalar(0, S32, S32)
278       .scalarize(0);
279   }
280 
281   // FIXME: Not really legal. Placeholder for custom lowering.
282   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
283     .legalFor({S32, S64})
284     .clampScalar(0, S32, S64)
285     .widenScalarToNextPow2(0, 32)
286     .scalarize(0);
287 
288   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
289     .legalFor({S32})
290     .clampScalar(0, S32, S32)
291     .scalarize(0);
292 
293   // Report legal for any types we can handle anywhere. For the cases only legal
294   // on the SALU, RegBankSelect will be able to re-legalize.
295   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
296     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
297     .clampScalar(0, S32, S64)
298     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
299     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
300     .widenScalarToNextPow2(0)
301     .scalarize(0);
302 
303   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
304                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
305     .legalFor({{S32, S1}, {S32, S32}})
306     .clampScalar(0, S32, S32)
307     .scalarize(0); // TODO: Implement.
308 
309   getActionDefinitionsBuilder(G_BITCAST)
310     // Don't worry about the size constraint.
311     .legalIf(all(isRegisterType(0), isRegisterType(1)))
312     // FIXME: Testing hack
313     .legalForCartesianProduct({S16, LLT::vector(2, 8), })
314     .lower();
315 
316 
317   getActionDefinitionsBuilder(G_CONSTANT)
318     .legalFor({S1, S32, S64, S16, GlobalPtr,
319                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
320     .clampScalar(0, S32, S64)
321     .widenScalarToNextPow2(0)
322     .legalIf(isPointer(0));
323 
324   getActionDefinitionsBuilder(G_FCONSTANT)
325     .legalFor({S32, S64, S16})
326     .clampScalar(0, S16, S64);
327 
328   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
329     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
330                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
331     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
332     .clampScalarOrElt(0, S32, S1024)
333     .legalIf(isMultiple32(0))
334     .widenScalarToNextPow2(0, 32)
335     .clampMaxNumElements(0, S32, 16);
336 
337   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
341 
342   auto &FPOpActions = getActionDefinitionsBuilder(
343     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344     .legalFor({S32, S64});
345   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346     .customFor({S32, S64});
347   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348     .customFor({S32, S64});
349 
350   if (ST.has16BitInsts()) {
351     if (ST.hasVOP3PInsts())
352       FPOpActions.legalFor({S16, V2S16});
353     else
354       FPOpActions.legalFor({S16});
355 
356     TrigActions.customFor({S16});
357     FDIVActions.customFor({S16});
358   }
359 
360   auto &MinNumMaxNum = getActionDefinitionsBuilder({
361       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362 
363   if (ST.hasVOP3PInsts()) {
364     MinNumMaxNum.customFor(FPTypesPK16)
365       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366       .clampMaxNumElements(0, S16, 2)
367       .clampScalar(0, S16, S64)
368       .scalarize(0);
369   } else if (ST.has16BitInsts()) {
370     MinNumMaxNum.customFor(FPTypes16)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else {
374     MinNumMaxNum.customFor(FPTypesBase)
375       .clampScalar(0, S32, S64)
376       .scalarize(0);
377   }
378 
379   if (ST.hasVOP3PInsts())
380     FPOpActions.clampMaxNumElements(0, S16, 2);
381 
382   FPOpActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   TrigActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   FDIVActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   getActionDefinitionsBuilder({G_FNEG, G_FABS})
395     .legalFor(FPTypesPK16)
396     .clampMaxNumElements(0, S16, 2)
397     .scalarize(0)
398     .clampScalar(0, S16, S64);
399 
400   if (ST.has16BitInsts()) {
401     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
402       .legalFor({S32, S64, S16})
403       .scalarize(0)
404       .clampScalar(0, S16, S64);
405   } else {
406     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
407       .legalFor({S32, S64})
408       .scalarize(0)
409       .clampScalar(0, S32, S64);
410   }
411 
412   getActionDefinitionsBuilder(G_FPTRUNC)
413     .legalFor({{S32, S64}, {S16, S32}})
414     .scalarize(0);
415 
416   getActionDefinitionsBuilder(G_FPEXT)
417     .legalFor({{S64, S32}, {S32, S16}})
418     .lowerFor({{S64, S16}}) // FIXME: Implement
419     .scalarize(0);
420 
421   getActionDefinitionsBuilder(G_FSUB)
422       // Use actual fsub instruction
423       .legalFor({S32})
424       // Must use fadd + fneg
425       .lowerFor({S64, S16, V2S16})
426       .scalarize(0)
427       .clampScalar(0, S32, S64);
428 
429   // Whether this is legal depends on the floating point mode for the function.
430   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
431   if (ST.hasMadF16())
432     FMad.customFor({S32, S16});
433   else
434     FMad.customFor({S32});
435   FMad.scalarize(0)
436       .lower();
437 
438   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
439     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
440                {S32, S1}, {S64, S1}, {S16, S1}})
441     .scalarize(0)
442     .clampScalar(0, S32, S64);
443 
444   // TODO: Split s1->s64 during regbankselect for VALU.
445   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
446     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
447     .lowerFor({{S32, S64}})
448     .lowerIf(typeIs(1, S1))
449     .customFor({{S64, S64}});
450   if (ST.has16BitInsts())
451     IToFP.legalFor({{S16, S16}});
452   IToFP.clampScalar(1, S32, S64)
453        .scalarize(0);
454 
455   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
456     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
457     .customFor({{S64, S64}});
458   if (ST.has16BitInsts())
459     FPToI.legalFor({{S16, S16}});
460   else
461     FPToI.minScalar(1, S32);
462 
463   FPToI.minScalar(0, S32)
464        .scalarize(0)
465        .lower();
466 
467   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
468     .scalarize(0)
469     .lower();
470 
471   if (ST.has16BitInsts()) {
472     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
473       .legalFor({S16, S32, S64})
474       .clampScalar(0, S16, S64)
475       .scalarize(0);
476   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
477     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
478       .legalFor({S32, S64})
479       .clampScalar(0, S32, S64)
480       .scalarize(0);
481   } else {
482     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
483       .legalFor({S32})
484       .customFor({S64})
485       .clampScalar(0, S32, S64)
486       .scalarize(0);
487   }
488 
489   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
490     .scalarize(0)
491     .alwaysLegal();
492 
493   auto &CmpBuilder =
494     getActionDefinitionsBuilder(G_ICMP)
495     // The compare output type differs based on the register bank of the output,
496     // so make both s1 and s32 legal.
497     //
498     // Scalar compares producing output in scc will be promoted to s32, as that
499     // is the allocatable register type that will be needed for the copy from
500     // scc. This will be promoted during RegBankSelect, and we assume something
501     // before that won't try to use s32 result types.
502     //
503     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
504     // bank.
505     .legalForCartesianProduct(
506       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
507     .legalForCartesianProduct(
508       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
509   if (ST.has16BitInsts()) {
510     CmpBuilder.legalFor({{S1, S16}});
511   }
512 
513   CmpBuilder
514     .widenScalarToNextPow2(1)
515     .clampScalar(1, S32, S64)
516     .scalarize(0)
517     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
518 
519   getActionDefinitionsBuilder(G_FCMP)
520     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
521     .widenScalarToNextPow2(1)
522     .clampScalar(1, S32, S64)
523     .scalarize(0);
524 
525   // FIXME: fpow has a selection pattern that should move to custom lowering.
526   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
527   if (ST.has16BitInsts())
528     Exp2Ops.legalFor({S32, S16});
529   else
530     Exp2Ops.legalFor({S32});
531   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
532   Exp2Ops.scalarize(0);
533 
534   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
535   if (ST.has16BitInsts())
536     ExpOps.customFor({{S32}, {S16}});
537   else
538     ExpOps.customFor({S32});
539   ExpOps.clampScalar(0, MinScalarFPTy, S32)
540         .scalarize(0);
541 
542   // The 64-bit versions produce 32-bit results, but only on the SALU.
543   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
544                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
545                                G_CTPOP})
546     .legalFor({{S32, S32}, {S32, S64}})
547     .clampScalar(0, S32, S32)
548     .clampScalar(1, S32, S64)
549     .scalarize(0)
550     .widenScalarToNextPow2(0, 32)
551     .widenScalarToNextPow2(1, 32);
552 
553   // TODO: Expand for > s32
554   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
555     .legalFor({S32})
556     .clampScalar(0, S32, S32)
557     .scalarize(0);
558 
559   if (ST.has16BitInsts()) {
560     if (ST.hasVOP3PInsts()) {
561       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
562         .legalFor({S32, S16, V2S16})
563         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
564         .clampMaxNumElements(0, S16, 2)
565         .clampScalar(0, S16, S32)
566         .widenScalarToNextPow2(0)
567         .scalarize(0);
568     } else {
569       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
570         .legalFor({S32, S16})
571         .widenScalarToNextPow2(0)
572         .clampScalar(0, S16, S32)
573         .scalarize(0);
574     }
575   } else {
576     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
577       .legalFor({S32})
578       .clampScalar(0, S32, S32)
579       .widenScalarToNextPow2(0)
580       .scalarize(0);
581   }
582 
583   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
584     return [=](const LegalityQuery &Query) {
585       return Query.Types[TypeIdx0].getSizeInBits() <
586              Query.Types[TypeIdx1].getSizeInBits();
587     };
588   };
589 
590   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
591     return [=](const LegalityQuery &Query) {
592       return Query.Types[TypeIdx0].getSizeInBits() >
593              Query.Types[TypeIdx1].getSizeInBits();
594     };
595   };
596 
597   getActionDefinitionsBuilder(G_INTTOPTR)
598     // List the common cases
599     .legalForCartesianProduct(AddrSpaces64, {S64})
600     .legalForCartesianProduct(AddrSpaces32, {S32})
601     .scalarize(0)
602     // Accept any address space as long as the size matches
603     .legalIf(sameSize(0, 1))
604     .widenScalarIf(smallerThan(1, 0),
605       [](const LegalityQuery &Query) {
606         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
607       })
608     .narrowScalarIf(greaterThan(1, 0),
609       [](const LegalityQuery &Query) {
610         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
611       });
612 
613   getActionDefinitionsBuilder(G_PTRTOINT)
614     // List the common cases
615     .legalForCartesianProduct(AddrSpaces64, {S64})
616     .legalForCartesianProduct(AddrSpaces32, {S32})
617     .scalarize(0)
618     // Accept any address space as long as the size matches
619     .legalIf(sameSize(0, 1))
620     .widenScalarIf(smallerThan(0, 1),
621       [](const LegalityQuery &Query) {
622         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
623       })
624     .narrowScalarIf(
625       greaterThan(0, 1),
626       [](const LegalityQuery &Query) {
627         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
628       });
629 
630   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
631     .scalarize(0)
632     .custom();
633 
634   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
635   // handle some operations by just promoting the register during
636   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
637   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
638     switch (AS) {
639     // FIXME: Private element size.
640     case AMDGPUAS::PRIVATE_ADDRESS:
641       return 32;
642     // FIXME: Check subtarget
643     case AMDGPUAS::LOCAL_ADDRESS:
644       return ST.useDS128() ? 128 : 64;
645 
646     // Treat constant and global as identical. SMRD loads are sometimes usable
647     // for global loads (ideally constant address space should be eliminated)
648     // depending on the context. Legality cannot be context dependent, but
649     // RegBankSelect can split the load as necessary depending on the pointer
650     // register bank/uniformity and if the memory is invariant or not written in
651     // a kernel.
652     case AMDGPUAS::CONSTANT_ADDRESS:
653     case AMDGPUAS::GLOBAL_ADDRESS:
654       return IsLoad ? 512 : 128;
655     default:
656       return 128;
657     }
658   };
659 
660   const auto needToSplitMemOp = [=](const LegalityQuery &Query, bool IsLoad) -> bool {
661     const LLT DstTy = Query.Types[0];
662 
663     // Split vector extloads.
664     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
665     unsigned Align = Query.MMODescrs[0].AlignInBits;
666 
667     if (MemSize < DstTy.getSizeInBits())
668       MemSize = std::max(MemSize, Align);
669 
670     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
671       return true;
672 
673     const LLT PtrTy = Query.Types[1];
674     unsigned AS = PtrTy.getAddressSpace();
675     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
676       return true;
677 
678     // Catch weird sized loads that don't evenly divide into the access sizes
679     // TODO: May be able to widen depending on alignment etc.
680     unsigned NumRegs = MemSize / 32;
681     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
682       return true;
683 
684     if (Align < MemSize) {
685       const SITargetLowering *TLI = ST.getTargetLowering();
686       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
687     }
688 
689     return false;
690   };
691 
692   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
693   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
694   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
695 
696   // TODO: Refine based on subtargets which support unaligned access or 128-bit
697   // LDS
698   // TODO: Unsupported flat for SI.
699 
700   for (unsigned Op : {G_LOAD, G_STORE}) {
701     const bool IsStore = Op == G_STORE;
702 
703     auto &Actions = getActionDefinitionsBuilder(Op);
704     // Whitelist the common cases.
705     // TODO: Pointer loads
706     // TODO: Wide constant loads
707     // TODO: Only CI+ has 3x loads
708     // TODO: Loads to s16 on gfx9
709     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
710                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
711                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
712                                       {S96, GlobalPtr, 96, GlobalAlign32},
713                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
714                                       {S128, GlobalPtr, 128, GlobalAlign32},
715                                       {S64, GlobalPtr, 64, GlobalAlign32},
716                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
717                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
718                                       {S32, GlobalPtr, 8, GlobalAlign8},
719                                       {S32, GlobalPtr, 16, GlobalAlign16},
720 
721                                       {S32, LocalPtr, 32, 32},
722                                       {S64, LocalPtr, 64, 32},
723                                       {V2S32, LocalPtr, 64, 32},
724                                       {S32, LocalPtr, 8, 8},
725                                       {S32, LocalPtr, 16, 16},
726                                       {V2S16, LocalPtr, 32, 32},
727 
728                                       {S32, PrivatePtr, 32, 32},
729                                       {S32, PrivatePtr, 8, 8},
730                                       {S32, PrivatePtr, 16, 16},
731                                       {V2S16, PrivatePtr, 32, 32},
732 
733                                       {S32, FlatPtr, 32, GlobalAlign32},
734                                       {S32, FlatPtr, 16, GlobalAlign16},
735                                       {S32, FlatPtr, 8, GlobalAlign8},
736                                       {V2S16, FlatPtr, 32, GlobalAlign32},
737 
738                                       {S32, ConstantPtr, 32, GlobalAlign32},
739                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
740                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
741                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
742                                       {S64, ConstantPtr, 64, GlobalAlign32},
743                                       {S128, ConstantPtr, 128, GlobalAlign32},
744                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
745     Actions
746         .customIf(typeIs(1, Constant32Ptr))
747         .narrowScalarIf(
748             [=](const LegalityQuery &Query) -> bool {
749               return !Query.Types[0].isVector() &&
750                      needToSplitMemOp(Query, Op == G_LOAD);
751             },
752             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
753               const LLT DstTy = Query.Types[0];
754               const LLT PtrTy = Query.Types[1];
755 
756               const unsigned DstSize = DstTy.getSizeInBits();
757               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
758 
759               // Split extloads.
760               if (DstSize > MemSize)
761                 return std::make_pair(0, LLT::scalar(MemSize));
762 
763               if (DstSize > 32 && (DstSize % 32 != 0)) {
764                 // FIXME: Need a way to specify non-extload of larger size if
765                 // suitably aligned.
766                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
767               }
768 
769               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
770                                                      Op == G_LOAD);
771               if (MemSize > MaxSize)
772                 return std::make_pair(0, LLT::scalar(MaxSize));
773 
774               unsigned Align = Query.MMODescrs[0].AlignInBits;
775               return std::make_pair(0, LLT::scalar(Align));
776             })
777         .fewerElementsIf(
778             [=](const LegalityQuery &Query) -> bool {
779               return Query.Types[0].isVector() &&
780                      needToSplitMemOp(Query, Op == G_LOAD);
781             },
782             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
783               const LLT DstTy = Query.Types[0];
784               const LLT PtrTy = Query.Types[1];
785 
786               LLT EltTy = DstTy.getElementType();
787               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
788                                                      Op == G_LOAD);
789 
790               // Split if it's too large for the address space.
791               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
792                 unsigned NumElts = DstTy.getNumElements();
793                 unsigned EltSize = EltTy.getSizeInBits();
794 
795                 if (MaxSize % EltSize == 0) {
796                   return std::make_pair(
797                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
798                 }
799 
800                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
801 
802                 // FIXME: Refine when odd breakdowns handled
803                 // The scalars will need to be re-legalized.
804                 if (NumPieces == 1 || NumPieces >= NumElts ||
805                     NumElts % NumPieces != 0)
806                   return std::make_pair(0, EltTy);
807 
808                 return std::make_pair(0,
809                                       LLT::vector(NumElts / NumPieces, EltTy));
810               }
811 
812               // Need to split because of alignment.
813               unsigned Align = Query.MMODescrs[0].AlignInBits;
814               unsigned EltSize = EltTy.getSizeInBits();
815               if (EltSize > Align &&
816                   (EltSize / Align < DstTy.getNumElements())) {
817                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
818               }
819 
820               // May need relegalization for the scalars.
821               return std::make_pair(0, EltTy);
822             })
823         .minScalar(0, S32);
824 
825     if (IsStore)
826       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
827 
828     // TODO: Need a bitcast lower option?
829     Actions
830         .legalIf([=](const LegalityQuery &Query) {
831           const LLT Ty0 = Query.Types[0];
832           unsigned Size = Ty0.getSizeInBits();
833           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
834           unsigned Align = Query.MMODescrs[0].AlignInBits;
835 
836           // FIXME: Widening store from alignment not valid.
837           if (MemSize < Size)
838             MemSize = std::max(MemSize, Align);
839 
840           // No extending vector loads.
841           if (Size > MemSize && Ty0.isVector())
842             return false;
843 
844           switch (MemSize) {
845           case 8:
846           case 16:
847             return Size == 32;
848           case 32:
849           case 64:
850           case 128:
851             return true;
852           case 96:
853             return ST.hasDwordx3LoadStores();
854           case 256:
855           case 512:
856             return true;
857           default:
858             return false;
859           }
860         })
861         .widenScalarToNextPow2(0)
862         // TODO: v3s32->v4s32 with alignment
863         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
864   }
865 
866   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
867                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
868                                                   {S32, GlobalPtr, 16, 2 * 8},
869                                                   {S32, LocalPtr, 8, 8},
870                                                   {S32, LocalPtr, 16, 16},
871                                                   {S32, PrivatePtr, 8, 8},
872                                                   {S32, PrivatePtr, 16, 16},
873                                                   {S32, ConstantPtr, 8, 8},
874                                                   {S32, ConstantPtr, 16, 2 * 8}});
875   if (ST.hasFlatAddressSpace()) {
876     ExtLoads.legalForTypesWithMemDesc(
877         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
878   }
879 
880   ExtLoads.clampScalar(0, S32, S32)
881           .widenScalarToNextPow2(0)
882           .unsupportedIfMemSizeNotPow2()
883           .lower();
884 
885   auto &Atomics = getActionDefinitionsBuilder(
886     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
887      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
888      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
889      G_ATOMICRMW_UMIN})
890     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
891                {S64, GlobalPtr}, {S64, LocalPtr}});
892   if (ST.hasFlatAddressSpace()) {
893     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
894   }
895 
896   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
897     .legalFor({{S32, LocalPtr}});
898 
899   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
900   // demarshalling
901   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
902     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
903                 {S32, FlatPtr}, {S64, FlatPtr}})
904     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
905                {S32, RegionPtr}, {S64, RegionPtr}});
906   // TODO: Pointer types, any 32-bit or 64-bit vector
907 
908   // Condition should be s32 for scalar, s1 for vector.
909   getActionDefinitionsBuilder(G_SELECT)
910     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
911           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
912           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
913     .clampScalar(0, S16, S64)
914     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
915     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
916     .scalarize(1)
917     .clampMaxNumElements(0, S32, 2)
918     .clampMaxNumElements(0, LocalPtr, 2)
919     .clampMaxNumElements(0, PrivatePtr, 2)
920     .scalarize(0)
921     .widenScalarToNextPow2(0)
922     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
923 
924   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
925   // be more flexible with the shift amount type.
926   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
927     .legalFor({{S32, S32}, {S64, S32}});
928   if (ST.has16BitInsts()) {
929     if (ST.hasVOP3PInsts()) {
930       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
931             .clampMaxNumElements(0, S16, 2);
932     } else
933       Shifts.legalFor({{S16, S32}, {S16, S16}});
934 
935     // TODO: Support 16-bit shift amounts
936     Shifts.clampScalar(1, S32, S32);
937     Shifts.clampScalar(0, S16, S64);
938     Shifts.widenScalarToNextPow2(0, 16);
939   } else {
940     // Make sure we legalize the shift amount type first, as the general
941     // expansion for the shifted type will produce much worse code if it hasn't
942     // been truncated already.
943     Shifts.clampScalar(1, S32, S32);
944     Shifts.clampScalar(0, S32, S64);
945     Shifts.widenScalarToNextPow2(0, 32);
946   }
947   Shifts.scalarize(0);
948 
949   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
950     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
951     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
952     unsigned IdxTypeIdx = 2;
953 
954     getActionDefinitionsBuilder(Op)
955       .customIf([=](const LegalityQuery &Query) {
956           const LLT EltTy = Query.Types[EltTypeIdx];
957           const LLT VecTy = Query.Types[VecTypeIdx];
958           const LLT IdxTy = Query.Types[IdxTypeIdx];
959           return (EltTy.getSizeInBits() == 16 ||
960                   EltTy.getSizeInBits() % 32 == 0) &&
961                  VecTy.getSizeInBits() % 32 == 0 &&
962                  VecTy.getSizeInBits() <= 1024 &&
963                  IdxTy.getSizeInBits() == 32;
964         })
965       .clampScalar(EltTypeIdx, S32, S64)
966       .clampScalar(VecTypeIdx, S32, S64)
967       .clampScalar(IdxTypeIdx, S32, S32);
968   }
969 
970   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
971     .unsupportedIf([=](const LegalityQuery &Query) {
972         const LLT &EltTy = Query.Types[1].getElementType();
973         return Query.Types[0] != EltTy;
974       });
975 
976   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
977     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
978     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
979 
980     // FIXME: Doesn't handle extract of illegal sizes.
981     getActionDefinitionsBuilder(Op)
982       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
983       // FIXME: Multiples of 16 should not be legal.
984       .legalIf([=](const LegalityQuery &Query) {
985           const LLT BigTy = Query.Types[BigTyIdx];
986           const LLT LitTy = Query.Types[LitTyIdx];
987           return (BigTy.getSizeInBits() % 32 == 0) &&
988                  (LitTy.getSizeInBits() % 16 == 0);
989         })
990       .widenScalarIf(
991         [=](const LegalityQuery &Query) {
992           const LLT BigTy = Query.Types[BigTyIdx];
993           return (BigTy.getScalarSizeInBits() < 16);
994         },
995         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
996       .widenScalarIf(
997         [=](const LegalityQuery &Query) {
998           const LLT LitTy = Query.Types[LitTyIdx];
999           return (LitTy.getScalarSizeInBits() < 16);
1000         },
1001         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1002       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1003       .widenScalarToNextPow2(BigTyIdx, 32);
1004 
1005   }
1006 
1007   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1008     .legalForCartesianProduct(AllS32Vectors, {S32})
1009     .legalForCartesianProduct(AllS64Vectors, {S64})
1010     .clampNumElements(0, V16S32, V32S32)
1011     .clampNumElements(0, V2S64, V16S64)
1012     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1013 
1014   if (ST.hasScalarPackInsts())
1015     BuildVector.legalFor({V2S16, S32});
1016 
1017   BuildVector
1018     .minScalarSameAs(1, 0)
1019     .legalIf(isRegisterType(0))
1020     .minScalarOrElt(0, S32);
1021 
1022   if (ST.hasScalarPackInsts()) {
1023     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1024       .legalFor({V2S16, S32})
1025       .lower();
1026   } else {
1027     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1028       .lower();
1029   }
1030 
1031   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1032     .legalIf(isRegisterType(0));
1033 
1034   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1035   // pre-legalize.
1036   if (ST.hasVOP3PInsts()) {
1037     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1038       .customFor({V2S16, V2S16})
1039       .lower();
1040   } else
1041     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1042 
1043   // Merge/Unmerge
1044   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1045     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1046     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1047 
1048     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1049       const LLT &Ty = Query.Types[TypeIdx];
1050       if (Ty.isVector()) {
1051         const LLT &EltTy = Ty.getElementType();
1052         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1053           return true;
1054         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1055           return true;
1056       }
1057       return false;
1058     };
1059 
1060     auto &Builder = getActionDefinitionsBuilder(Op)
1061       // Try to widen to s16 first for small types.
1062       // TODO: Only do this on targets with legal s16 shifts
1063       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1064 
1065       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1066       .lowerFor({{S16, V2S16}})
1067       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1068       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1069                            elementTypeIs(1, S16)),
1070                        changeTo(1, V2S16))
1071       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1072       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1073       // valid.
1074       .clampScalar(LitTyIdx, S32, S256)
1075       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1076       // Break up vectors with weird elements into scalars
1077       .fewerElementsIf(
1078         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1079         scalarize(0))
1080       .fewerElementsIf(
1081         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1082         scalarize(1))
1083       .clampScalar(BigTyIdx, S32, S1024);
1084 
1085     if (Op == G_MERGE_VALUES) {
1086       Builder.widenScalarIf(
1087         // TODO: Use 16-bit shifts if legal for 8-bit values?
1088         [=](const LegalityQuery &Query) {
1089           const LLT Ty = Query.Types[LitTyIdx];
1090           return Ty.getSizeInBits() < 32;
1091         },
1092         changeTo(LitTyIdx, S32));
1093     }
1094 
1095     Builder.widenScalarIf(
1096       [=](const LegalityQuery &Query) {
1097         const LLT Ty = Query.Types[BigTyIdx];
1098         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1099           Ty.getSizeInBits() % 16 != 0;
1100       },
1101       [=](const LegalityQuery &Query) {
1102         // Pick the next power of 2, or a multiple of 64 over 128.
1103         // Whichever is smaller.
1104         const LLT &Ty = Query.Types[BigTyIdx];
1105         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1106         if (NewSizeInBits >= 256) {
1107           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1108           if (RoundedTo < NewSizeInBits)
1109             NewSizeInBits = RoundedTo;
1110         }
1111         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1112       })
1113       .legalIf([=](const LegalityQuery &Query) {
1114           const LLT &BigTy = Query.Types[BigTyIdx];
1115           const LLT &LitTy = Query.Types[LitTyIdx];
1116 
1117           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1118             return false;
1119           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1120             return false;
1121 
1122           return BigTy.getSizeInBits() % 16 == 0 &&
1123                  LitTy.getSizeInBits() % 16 == 0 &&
1124                  BigTy.getSizeInBits() <= 1024;
1125         })
1126       // Any vectors left are the wrong size. Scalarize them.
1127       .scalarize(0)
1128       .scalarize(1);
1129   }
1130 
1131   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1132   // RegBankSelect.
1133   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1134     .legalFor({{S32}, {S64}});
1135 
1136   if (ST.hasVOP3PInsts()) {
1137     SextInReg.lowerFor({{V2S16}})
1138       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1139       // get more vector shift opportunities, since we'll get those when
1140       // expanded.
1141       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1142   } else if (ST.has16BitInsts()) {
1143     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1144   } else {
1145     // Prefer to promote to s32 before lowering if we don't have 16-bit
1146     // shifts. This avoid a lot of intermediate truncate and extend operations.
1147     SextInReg.lowerFor({{S32}, {S64}});
1148   }
1149 
1150   SextInReg
1151     .scalarize(0)
1152     .clampScalar(0, S32, S64)
1153     .lower();
1154 
1155   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1156     .legalFor({S64});
1157 
1158   getActionDefinitionsBuilder({
1159       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1160       G_FCOPYSIGN,
1161 
1162       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1163       G_READ_REGISTER,
1164       G_WRITE_REGISTER,
1165 
1166       G_SADDO, G_SSUBO,
1167 
1168        // TODO: Implement
1169       G_FMINIMUM, G_FMAXIMUM
1170     }).lower();
1171 
1172   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1173         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1174         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1175     .unsupported();
1176 
1177   computeTables();
1178   verify(*ST.getInstrInfo());
1179 }
1180 
1181 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1182                                          MachineRegisterInfo &MRI,
1183                                          MachineIRBuilder &B,
1184                                          GISelChangeObserver &Observer) const {
1185   switch (MI.getOpcode()) {
1186   case TargetOpcode::G_ADDRSPACE_CAST:
1187     return legalizeAddrSpaceCast(MI, MRI, B);
1188   case TargetOpcode::G_FRINT:
1189     return legalizeFrint(MI, MRI, B);
1190   case TargetOpcode::G_FCEIL:
1191     return legalizeFceil(MI, MRI, B);
1192   case TargetOpcode::G_INTRINSIC_TRUNC:
1193     return legalizeIntrinsicTrunc(MI, MRI, B);
1194   case TargetOpcode::G_SITOFP:
1195     return legalizeITOFP(MI, MRI, B, true);
1196   case TargetOpcode::G_UITOFP:
1197     return legalizeITOFP(MI, MRI, B, false);
1198   case TargetOpcode::G_FPTOSI:
1199     return legalizeFPTOI(MI, MRI, B, true);
1200   case TargetOpcode::G_FPTOUI:
1201     return legalizeFPTOI(MI, MRI, B, false);
1202   case TargetOpcode::G_FMINNUM:
1203   case TargetOpcode::G_FMAXNUM:
1204   case TargetOpcode::G_FMINNUM_IEEE:
1205   case TargetOpcode::G_FMAXNUM_IEEE:
1206     return legalizeMinNumMaxNum(MI, MRI, B);
1207   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1208     return legalizeExtractVectorElt(MI, MRI, B);
1209   case TargetOpcode::G_INSERT_VECTOR_ELT:
1210     return legalizeInsertVectorElt(MI, MRI, B);
1211   case TargetOpcode::G_SHUFFLE_VECTOR:
1212     return legalizeShuffleVector(MI, MRI, B);
1213   case TargetOpcode::G_FSIN:
1214   case TargetOpcode::G_FCOS:
1215     return legalizeSinCos(MI, MRI, B);
1216   case TargetOpcode::G_GLOBAL_VALUE:
1217     return legalizeGlobalValue(MI, MRI, B);
1218   case TargetOpcode::G_LOAD:
1219     return legalizeLoad(MI, MRI, B, Observer);
1220   case TargetOpcode::G_FMAD:
1221     return legalizeFMad(MI, MRI, B);
1222   case TargetOpcode::G_FDIV:
1223     return legalizeFDIV(MI, MRI, B);
1224   case TargetOpcode::G_ATOMIC_CMPXCHG:
1225     return legalizeAtomicCmpXChg(MI, MRI, B);
1226   case TargetOpcode::G_FLOG:
1227     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1228   case TargetOpcode::G_FLOG10:
1229     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1230   case TargetOpcode::G_FEXP:
1231     return legalizeFExp(MI, B);
1232   default:
1233     return false;
1234   }
1235 
1236   llvm_unreachable("expected switch to return");
1237 }
1238 
1239 Register AMDGPULegalizerInfo::getSegmentAperture(
1240   unsigned AS,
1241   MachineRegisterInfo &MRI,
1242   MachineIRBuilder &B) const {
1243   MachineFunction &MF = B.getMF();
1244   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1245   const LLT S32 = LLT::scalar(32);
1246 
1247   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1248 
1249   if (ST.hasApertureRegs()) {
1250     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1251     // getreg.
1252     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1253         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1254         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1255     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1256         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1257         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1258     unsigned Encoding =
1259         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1260         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1261         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1262 
1263     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1264 
1265     B.buildInstr(AMDGPU::S_GETREG_B32)
1266       .addDef(GetReg)
1267       .addImm(Encoding);
1268     MRI.setType(GetReg, S32);
1269 
1270     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1271     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1272   }
1273 
1274   Register QueuePtr = MRI.createGenericVirtualRegister(
1275     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1276 
1277   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1278   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1279     return Register();
1280 
1281   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1282   // private_segment_aperture_base_hi.
1283   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1284 
1285   // TODO: can we be smarter about machine pointer info?
1286   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1287   MachineMemOperand *MMO = MF.getMachineMemOperand(
1288     PtrInfo,
1289     MachineMemOperand::MOLoad |
1290     MachineMemOperand::MODereferenceable |
1291     MachineMemOperand::MOInvariant,
1292     4,
1293     MinAlign(64, StructOffset));
1294 
1295   Register LoadAddr;
1296 
1297   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1298   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1299 }
1300 
1301 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1302   MachineInstr &MI, MachineRegisterInfo &MRI,
1303   MachineIRBuilder &B) const {
1304   MachineFunction &MF = B.getMF();
1305 
1306   B.setInstr(MI);
1307 
1308   const LLT S32 = LLT::scalar(32);
1309   Register Dst = MI.getOperand(0).getReg();
1310   Register Src = MI.getOperand(1).getReg();
1311 
1312   LLT DstTy = MRI.getType(Dst);
1313   LLT SrcTy = MRI.getType(Src);
1314   unsigned DestAS = DstTy.getAddressSpace();
1315   unsigned SrcAS = SrcTy.getAddressSpace();
1316 
1317   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1318   // vector element.
1319   assert(!DstTy.isVector());
1320 
1321   const AMDGPUTargetMachine &TM
1322     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1323 
1324   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1325   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1326     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1327     return true;
1328   }
1329 
1330   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1331     // Truncate.
1332     B.buildExtract(Dst, Src, 0);
1333     MI.eraseFromParent();
1334     return true;
1335   }
1336 
1337   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1338     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1339     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1340 
1341     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1342     // another. Merge operands are required to be the same type, but creating an
1343     // extra ptrtoint would be kind of pointless.
1344     auto HighAddr = B.buildConstant(
1345       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1346     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1347     MI.eraseFromParent();
1348     return true;
1349   }
1350 
1351   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1352     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1353            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1354     unsigned NullVal = TM.getNullPointerValue(DestAS);
1355 
1356     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1357     auto FlatNull = B.buildConstant(SrcTy, 0);
1358 
1359     // Extract low 32-bits of the pointer.
1360     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1361 
1362     auto CmpRes =
1363         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1364     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1365 
1366     MI.eraseFromParent();
1367     return true;
1368   }
1369 
1370   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1371     return false;
1372 
1373   if (!ST.hasFlatAddressSpace())
1374     return false;
1375 
1376   auto SegmentNull =
1377       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1378   auto FlatNull =
1379       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1380 
1381   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1382   if (!ApertureReg.isValid())
1383     return false;
1384 
1385   auto CmpRes =
1386       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1387 
1388   // Coerce the type of the low half of the result so we can use merge_values.
1389   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1390 
1391   // TODO: Should we allow mismatched types but matching sizes in merges to
1392   // avoid the ptrtoint?
1393   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1394   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1395 
1396   MI.eraseFromParent();
1397   return true;
1398 }
1399 
1400 bool AMDGPULegalizerInfo::legalizeFrint(
1401   MachineInstr &MI, MachineRegisterInfo &MRI,
1402   MachineIRBuilder &B) const {
1403   B.setInstr(MI);
1404 
1405   Register Src = MI.getOperand(1).getReg();
1406   LLT Ty = MRI.getType(Src);
1407   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1408 
1409   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1410   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1411 
1412   auto C1 = B.buildFConstant(Ty, C1Val);
1413   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1414 
1415   // TODO: Should this propagate fast-math-flags?
1416   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1417   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1418 
1419   auto C2 = B.buildFConstant(Ty, C2Val);
1420   auto Fabs = B.buildFAbs(Ty, Src);
1421 
1422   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1423   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1424   return true;
1425 }
1426 
1427 bool AMDGPULegalizerInfo::legalizeFceil(
1428   MachineInstr &MI, MachineRegisterInfo &MRI,
1429   MachineIRBuilder &B) const {
1430   B.setInstr(MI);
1431 
1432   const LLT S1 = LLT::scalar(1);
1433   const LLT S64 = LLT::scalar(64);
1434 
1435   Register Src = MI.getOperand(1).getReg();
1436   assert(MRI.getType(Src) == S64);
1437 
1438   // result = trunc(src)
1439   // if (src > 0.0 && src != result)
1440   //   result += 1.0
1441 
1442   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1443 
1444   const auto Zero = B.buildFConstant(S64, 0.0);
1445   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1449   auto Add = B.buildSelect(S64, And, One, Zero);
1450 
1451   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1454 }
1455 
static MachineInstrBuilder extractF64Exponent(Register Hi,
1457                                               MachineIRBuilder &B) {
1458   const unsigned FractBits = 52;
1459   const unsigned ExpBits = 11;
1460   LLT S32 = LLT::scalar(32);
1461 
1462   auto Const0 = B.buildConstant(S32, FractBits - 32);
1463   auto Const1 = B.buildConstant(S32, ExpBits);
1464 
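  // The exponent occupies bits [52, 62] of the f64, i.e. bits [20, 30] of the
  // high dword; ubfe(Hi, 20, 11) extracts it and 1023 is the exponent bias.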
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1468 
1469   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1470 }
1471 
1472 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1473   MachineInstr &MI, MachineRegisterInfo &MRI,
1474   MachineIRBuilder &B) const {
1475   B.setInstr(MI);
1476 
1477   const LLT S1 = LLT::scalar(1);
1478   const LLT S32 = LLT::scalar(32);
1479   const LLT S64 = LLT::scalar(64);
1480 
1481   Register Src = MI.getOperand(1).getReg();
1482   assert(MRI.getType(Src) == S64);
1483 
1484   // TODO: Should this use extract since the low half is unused?
1485   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1486   Register Hi = Unmerge.getReg(1);
1487 
1488   // Extract the upper half, since this is where we will find the sign and
1489   // exponent.
1490   auto Exp = extractF64Exponent(Hi, B);
1491 
1492   const unsigned FractBits = 52;
1493 
1494   // Extract the sign bit.
1495   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1496   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1497 
1498   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1499 
1500   const auto Zero32 = B.buildConstant(S32, 0);
1501 
1502   // Extend back to 64-bits.
1503   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1504 
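  // For exponents in [0, 51], (FractMask >> Exp) covers exactly the fraction
  // bits of the value, so clearing them with the complemented mask truncates.
  // Exp < 0 means |Src| < 1 and the result is a signed zero; Exp > 51 means
  // Src is already an integer.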
1505   auto Shr = B.buildAShr(S64, FractMask, Exp);
1506   auto Not = B.buildNot(S64, Shr);
1507   auto Tmp0 = B.buildAnd(S64, Src, Not);
1508   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1509 
1510   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1511   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1512 
1513   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1516 }
1517 
1518 bool AMDGPULegalizerInfo::legalizeITOFP(
1519   MachineInstr &MI, MachineRegisterInfo &MRI,
1520   MachineIRBuilder &B, bool Signed) const {
1521   B.setInstr(MI);
1522 
1523   Register Dst = MI.getOperand(0).getReg();
1524   Register Src = MI.getOperand(1).getReg();
1525 
1526   const LLT S64 = LLT::scalar(64);
1527   const LLT S32 = LLT::scalar(32);
1528 
1529   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1530 
1531   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1532 
1533   auto CvtHi = Signed ?
1534     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1535     B.buildUITOFP(S64, Unmerge.getReg(1));
1536 
1537   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1538 
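  // Scale the converted high half by 2^32 with ldexp, then add the
  // unsigned-converted low half to form the full 64-bit value.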
1539   auto ThirtyTwo = B.buildConstant(S32, 32);
1540   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1541     .addUse(CvtHi.getReg(0))
1542     .addUse(ThirtyTwo.getReg(0));
1543 
1544   // TODO: Should this propagate fast-math-flags?
1545   B.buildFAdd(Dst, LdExp, CvtLo);
1546   MI.eraseFromParent();
1547   return true;
1548 }
1549 
1550 // TODO: Copied from DAG implementation. Verify logic and document how this
1551 // actually works.
1552 bool AMDGPULegalizerInfo::legalizeFPTOI(
1553   MachineInstr &MI, MachineRegisterInfo &MRI,
1554   MachineIRBuilder &B, bool Signed) const {
1555   B.setInstr(MI);
1556 
1557   Register Dst = MI.getOperand(0).getReg();
1558   Register Src = MI.getOperand(1).getReg();
1559 
1560   const LLT S64 = LLT::scalar(64);
1561   const LLT S32 = LLT::scalar(32);
1562 
1563   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1564 
1565   unsigned Flags = MI.getFlags();
1566 
1567   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1568   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1569   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
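  // K0 = 2^-32 and K1 = -2^32: Hi = floor(Trunc * 2^-32) is the high word and
  // fma(Hi, -2^32, Trunc) recovers the low word.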
1570 
1571   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1572   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1573   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1574 
1575   auto Hi = Signed ?
1576     B.buildFPTOSI(S32, FloorMul) :
1577     B.buildFPTOUI(S32, FloorMul);
1578   auto Lo = B.buildFPTOUI(S32, Fma);
1579 
1580   B.buildMerge(Dst, { Lo.getReg(0), Hi.getReg(0) });
1581   MI.eraseFromParent();
1582 
1583   return true;
1584 }
1585 
1586 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1587   MachineInstr &MI, MachineRegisterInfo &MRI,
1588   MachineIRBuilder &B) const {
1589   MachineFunction &MF = B.getMF();
1590   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1591 
1592   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1593                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1594 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1597   if (!MFI->getMode().IEEE)
1598     return !IsIEEEOp;
1599 
1600   if (IsIEEEOp)
1601     return true;
1602 
1603   MachineIRBuilder HelperBuilder(MI);
1604   GISelObserverWrapper DummyObserver;
1605   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1606   HelperBuilder.setInstr(MI);
1607   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1608 }
1609 
1610 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1611   MachineInstr &MI, MachineRegisterInfo &MRI,
1612   MachineIRBuilder &B) const {
1613   // TODO: Should move some of this into LegalizerHelper.
1614 
1615   // TODO: Promote dynamic indexing of s16 to s32
1616   // TODO: Dynamic s64 indexing is only legal for SGPR.
1617   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1618   if (!IdxVal) // Dynamic case will be selected to register indexing.
1619     return true;
1620 
1621   Register Dst = MI.getOperand(0).getReg();
1622   Register Vec = MI.getOperand(1).getReg();
1623 
1624   LLT VecTy = MRI.getType(Vec);
1625   LLT EltTy = VecTy.getElementType();
1626   assert(EltTy == MRI.getType(Dst));
1627 
1628   B.setInstr(MI);
1629 
1630   if (IdxVal.getValue() < VecTy.getNumElements())
1631     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1632   else
1633     B.buildUndef(Dst);
1634 
1635   MI.eraseFromParent();
1636   return true;
1637 }
1638 
1639 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1640   MachineInstr &MI, MachineRegisterInfo &MRI,
1641   MachineIRBuilder &B) const {
1642   // TODO: Should move some of this into LegalizerHelper.
1643 
1644   // TODO: Promote dynamic indexing of s16 to s32
1645   // TODO: Dynamic s64 indexing is only legal for SGPR.
1646   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1647   if (!IdxVal) // Dynamic case will be selected to register indexing.
1648     return true;
1649 
1650   Register Dst = MI.getOperand(0).getReg();
1651   Register Vec = MI.getOperand(1).getReg();
1652   Register Ins = MI.getOperand(2).getReg();
1653 
1654   LLT VecTy = MRI.getType(Vec);
1655   LLT EltTy = VecTy.getElementType();
1656   assert(EltTy == MRI.getType(Ins));
1657 
1658   B.setInstr(MI);
1659 
1660   if (IdxVal.getValue() < VecTy.getNumElements())
1661     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1662   else
1663     B.buildUndef(Dst);
1664 
1665   MI.eraseFromParent();
1666   return true;
1667 }
1668 
1669 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1670   assert(Mask.size() == 2);
1671 
1672   // If one half is undef, the other is trivially in the same reg.
1673   if (Mask[0] == -1 || Mask[1] == -1)
1674     return true;
1675   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1676          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1677 }
1678 
1679 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1680   MachineInstr &MI, MachineRegisterInfo &MRI,
1681   MachineIRBuilder &B) const {
1682   const LLT V2S16 = LLT::vector(2, 16);
1683 
1684   Register Dst = MI.getOperand(0).getReg();
1685   Register Src0 = MI.getOperand(1).getReg();
1686   LLT DstTy = MRI.getType(Dst);
1687   LLT SrcTy = MRI.getType(Src0);
1688 
1689   if (SrcTy == V2S16 && DstTy == V2S16 &&
1690       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1691     return true;
1692 
1693   MachineIRBuilder HelperBuilder(MI);
1694   GISelObserverWrapper DummyObserver;
1695   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1696   HelperBuilder.setInstr(MI);
1697   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1698 }
1699 
1700 bool AMDGPULegalizerInfo::legalizeSinCos(
1701   MachineInstr &MI, MachineRegisterInfo &MRI,
1702   MachineIRBuilder &B) const {
1703   B.setInstr(MI);
1704 
1705   Register DstReg = MI.getOperand(0).getReg();
1706   Register SrcReg = MI.getOperand(1).getReg();
1707   LLT Ty = MRI.getType(DstReg);
1708   unsigned Flags = MI.getFlags();
1709 
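  // The hardware sin/cos instructions take the angle pre-scaled by 1/(2*pi).
  // Subtargets with a reduced valid input range additionally wrap the scaled
  // value into [0, 1) with amdgcn.fract first.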
1710   Register TrigVal;
1711   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1712   if (ST.hasTrigReducedRange()) {
1713     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1714     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1715       .addUse(MulVal.getReg(0))
1716       .setMIFlags(Flags).getReg(0);
1717   } else
1718     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1719 
1720   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1721     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1722   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1723     .addUse(TrigVal)
1724     .setMIFlags(Flags);
1725   MI.eraseFromParent();
1726   return true;
1727 }
1728 
1729 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1730   Register DstReg, LLT PtrTy,
1731   MachineIRBuilder &B, const GlobalValue *GV,
1732   unsigned Offset, unsigned GAFlags) const {
1733   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1734   // to the following code sequence:
1735   //
1736   // For constant address space:
1737   //   s_getpc_b64 s[0:1]
1738   //   s_add_u32 s0, s0, $symbol
1739   //   s_addc_u32 s1, s1, 0
1740   //
1741   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1742   //   a fixup or relocation is emitted to replace $symbol with a literal
1743   //   constant, which is a pc-relative offset from the encoding of the $symbol
1744   //   operand to the global variable.
1745   //
1746   // For global address space:
1747   //   s_getpc_b64 s[0:1]
1748   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1749   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1750   //
1751   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1752   //   fixups or relocations are emitted to replace $symbol@*@lo and
1753   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1754   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1755   //   operand to the global variable.
1756   //
1757   // What we want here is an offset from the value returned by s_getpc
1758   // (which is the address of the s_add_u32 instruction) to the global
1759   // variable, but since the encoding of $symbol starts 4 bytes after the start
1760   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1761   // small. This requires us to add 4 to the global variable offset in order to
1762   // compute the correct address.
1763 
1764   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1765 
1766   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1767     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1768 
1769   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1770     .addDef(PCReg);
1771 
1772   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1773   if (GAFlags == SIInstrInfo::MO_NONE)
1774     MIB.addImm(0);
1775   else
1776     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1777 
1778   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1779 
1780   if (PtrTy.getSizeInBits() == 32)
1781     B.buildExtract(DstReg, PCReg, 0);
1782   return true;
}
1784 
1785 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1786   MachineInstr &MI, MachineRegisterInfo &MRI,
1787   MachineIRBuilder &B) const {
1788   Register DstReg = MI.getOperand(0).getReg();
1789   LLT Ty = MRI.getType(DstReg);
1790   unsigned AS = Ty.getAddressSpace();
1791 
1792   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1793   MachineFunction &MF = B.getMF();
1794   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1795   B.setInstr(MI);
1796 
1797   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1798     if (!MFI->isEntryFunction()) {
1799       const Function &Fn = MF.getFunction();
1800       DiagnosticInfoUnsupported BadLDSDecl(
1801         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1802       Fn.getContext().diagnose(BadLDSDecl);
1803     }
1804 
1805     // TODO: We could emit code to handle the initialization somewhere.
1806     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1807       const SITargetLowering *TLI = ST.getTargetLowering();
1808       if (!TLI->shouldUseLDSConstAddress(GV)) {
1809         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1810         return true; // Leave in place;
1811       }
1812 
1813       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1814       MI.eraseFromParent();
1815       return true;
1816     }
1817 
1818     const Function &Fn = MF.getFunction();
1819     DiagnosticInfoUnsupported BadInit(
1820       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1821     Fn.getContext().diagnose(BadInit);
1822     return true;
1823   }
1824 
1825   const SITargetLowering *TLI = ST.getTargetLowering();
1826 
1827   if (TLI->shouldEmitFixup(GV)) {
1828     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1829     MI.eraseFromParent();
1830     return true;
1831   }
1832 
1833   if (TLI->shouldEmitPCReloc(GV)) {
1834     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1835     MI.eraseFromParent();
1836     return true;
1837   }
1838 
1839   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1840   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1841 
1842   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1843     MachinePointerInfo::getGOT(MF),
1844     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1845     MachineMemOperand::MOInvariant,
1846     8 /*Size*/, 8 /*Align*/);
1847 
1848   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1849 
1850   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1852     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1853     B.buildExtract(DstReg, Load, 0);
1854   } else
1855     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1856 
1857   MI.eraseFromParent();
1858   return true;
1859 }
1860 
1861 bool AMDGPULegalizerInfo::legalizeLoad(
1862   MachineInstr &MI, MachineRegisterInfo &MRI,
1863   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1864   B.setInstr(MI);
1865   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1866   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1867   Observer.changingInstr(MI);
1868   MI.getOperand(1).setReg(Cast.getReg(0));
1869   Observer.changedInstr(MI);
1870   return true;
1871 }
1872 
1873 bool AMDGPULegalizerInfo::legalizeFMad(
1874   MachineInstr &MI, MachineRegisterInfo &MRI,
1875   MachineIRBuilder &B) const {
1876   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1877   assert(Ty.isScalar());
1878 
1879   MachineFunction &MF = B.getMF();
1880   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1881 
1882   // TODO: Always legal with future ftz flag.
  // FIXME: Do we only need to check the output type?
1884   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
1885     return true;
1886   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
1887     return true;
1888 
1889   MachineIRBuilder HelperBuilder(MI);
1890   GISelObserverWrapper DummyObserver;
1891   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1892   HelperBuilder.setMBB(*MI.getParent());
1893   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1894 }
1895 
1896 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1897   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1898   Register DstReg = MI.getOperand(0).getReg();
1899   Register PtrReg = MI.getOperand(1).getReg();
1900   Register CmpVal = MI.getOperand(2).getReg();
1901   Register NewVal = MI.getOperand(3).getReg();
1902 
1903   assert(SITargetLowering::isFlatGlobalAddrSpace(
1904            MRI.getType(PtrReg).getAddressSpace()) &&
1905          "this should not have been custom lowered");
1906 
1907   LLT ValTy = MRI.getType(CmpVal);
1908   LLT VecTy = LLT::vector(2, ValTy);
1909 
1910   B.setInstr(MI);
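  // The cmpxchg pseudo takes the new value and the compare value packed into
  // a single two-element vector: element 0 is the new value, element 1 the
  // compare value.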
1911   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1912 
1913   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1914     .addDef(DstReg)
1915     .addUse(PtrReg)
1916     .addUse(PackedVal)
1917     .setMemRefs(MI.memoperands());
1918 
1919   MI.eraseFromParent();
1920   return true;
1921 }
1922 
1923 bool AMDGPULegalizerInfo::legalizeFlog(
1924   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
1925   Register Dst = MI.getOperand(0).getReg();
1926   Register Src = MI.getOperand(1).getReg();
1927   LLT Ty = B.getMRI()->getType(Dst);
1928   unsigned Flags = MI.getFlags();
1929   B.setInstr(MI);
1930 
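  // log_b(x) = log2(x) * (1 / log2(b)); the caller supplies 1/log2(b) as
  // Log2BaseInverted.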
1931   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
1932   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
1933 
1934   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
1935   MI.eraseFromParent();
1936   return true;
1937 }
1938 
1939 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
1940                                        MachineIRBuilder &B) const {
1941   Register Dst = MI.getOperand(0).getReg();
1942   Register Src = MI.getOperand(1).getReg();
1943   unsigned Flags = MI.getFlags();
1944   LLT Ty = B.getMRI()->getType(Dst);
1945   B.setInstr(MI);
1946 
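  // exp(x) = exp2(x * log2(e)).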
1947   auto K = B.buildFConstant(Ty, numbers::log2e);
1948   auto Mul = B.buildFMul(Ty, Src, K, Flags);
1949   B.buildFExp2(Dst, Mul, Flags);
1950 
1951   MI.eraseFromParent();
1952   return true;
1953 }
1954 
// Return the use branch instruction, or null if the usage is invalid.
1956 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1957                                        MachineRegisterInfo &MRI,
1958                                        MachineInstr *&Br) {
1959   Register CondDef = MI.getOperand(0).getReg();
1960   if (!MRI.hasOneNonDBGUse(CondDef))
1961     return nullptr;
1962 
1963   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1964   if (UseMI.getParent() != MI.getParent() ||
1965       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1966     return nullptr;
1967 
1968   // Make sure the cond br is followed by a G_BR
1969   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1970   if (Next != MI.getParent()->end()) {
1971     if (Next->getOpcode() != AMDGPU::G_BR)
1972       return nullptr;
1973     Br = &*Next;
1974   }
1975 
1976   return &UseMI;
1977 }
1978 
1979 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1980                                                 Register Reg, LLT Ty) const {
1981   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1982   if (LiveIn)
1983     return LiveIn;
1984 
1985   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1986   MRI.addLiveIn(Reg, NewReg);
1987   return NewReg;
1988 }
1989 
1990 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1991                                          const ArgDescriptor *Arg) const {
1992   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1993     return false; // TODO: Handle these
1994 
1995   assert(Arg->getRegister().isPhysical());
1996 
1997   MachineRegisterInfo &MRI = *B.getMRI();
1998 
1999   LLT Ty = MRI.getType(DstReg);
2000   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2001 
2002   if (Arg->isMasked()) {
2003     // TODO: Should we try to emit this once in the entry block?
2004     const LLT S32 = LLT::scalar(32);
2005     const unsigned Mask = Arg->getMask();
2006     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2007 
2008     Register AndMaskSrc = LiveIn;
2009 
2010     if (Shift != 0) {
2011       auto ShiftAmt = B.buildConstant(S32, Shift);
2012       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2013     }
2014 
2015     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2016   } else
2017     B.buildCopy(DstReg, LiveIn);
2018 
  // Insert the argument copy if it doesn't already exist.
2020   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2021   if (!MRI.getVRegDef(LiveIn)) {
2022     // FIXME: Should have scoped insert pt
2023     MachineBasicBlock &OrigInsBB = B.getMBB();
2024     auto OrigInsPt = B.getInsertPt();
2025 
2026     MachineBasicBlock &EntryMBB = B.getMF().front();
2027     EntryMBB.addLiveIn(Arg->getRegister());
2028     B.setInsertPt(EntryMBB, EntryMBB.begin());
2029     B.buildCopy(LiveIn, Arg->getRegister());
2030 
2031     B.setInsertPt(OrigInsBB, OrigInsPt);
2032   }
2033 
2034   return true;
2035 }
2036 
2037 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2038   MachineInstr &MI,
2039   MachineRegisterInfo &MRI,
2040   MachineIRBuilder &B,
2041   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2042   B.setInstr(MI);
2043 
2044   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2045 
2046   const ArgDescriptor *Arg;
2047   const TargetRegisterClass *RC;
2048   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2049   if (!Arg) {
2050     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2051     return false;
2052   }
2053 
2054   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2055     MI.eraseFromParent();
2056     return true;
2057   }
2058 
2059   return false;
2060 }
2061 
2062 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2063                                        MachineRegisterInfo &MRI,
2064                                        MachineIRBuilder &B) const {
2065   B.setInstr(MI);
2066   Register Dst = MI.getOperand(0).getReg();
2067   LLT DstTy = MRI.getType(Dst);
2068   LLT S16 = LLT::scalar(16);
2069   LLT S32 = LLT::scalar(32);
2070   LLT S64 = LLT::scalar(64);
2071 
2072   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2073     return true;
2074 
2075   if (DstTy == S16)
2076     return legalizeFDIV16(MI, MRI, B);
2077   if (DstTy == S32)
2078     return legalizeFDIV32(MI, MRI, B);
2079   if (DstTy == S64)
2080     return legalizeFDIV64(MI, MRI, B);
2081 
2082   return false;
2083 }
2084 
2085 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2086                                                  MachineRegisterInfo &MRI,
2087                                                  MachineIRBuilder &B) const {
2088   Register Res = MI.getOperand(0).getReg();
2089   Register LHS = MI.getOperand(1).getReg();
2090   Register RHS = MI.getOperand(2).getReg();
2091 
2092   uint16_t Flags = MI.getFlags();
2093 
2094   LLT ResTy = MRI.getType(Res);
2095   LLT S32 = LLT::scalar(32);
2096   LLT S64 = LLT::scalar(64);
2097 
2098   const MachineFunction &MF = B.getMF();
2099   bool Unsafe =
2100     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2101 
2102   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2103     return false;
2104 
2105   if (!Unsafe && ResTy == S32 &&
2106       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2107     return false;
2108 
2109   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2110     // 1 / x -> RCP(x)
2111     if (CLHS->isExactlyValue(1.0)) {
2112       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2113         .addUse(RHS)
2114         .setMIFlags(Flags);
2115 
2116       MI.eraseFromParent();
2117       return true;
2118     }
2119 
2120     // -1 / x -> RCP( FNEG(x) )
2121     if (CLHS->isExactlyValue(-1.0)) {
2122       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2123       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2124         .addUse(FNeg.getReg(0))
2125         .setMIFlags(Flags);
2126 
2127       MI.eraseFromParent();
2128       return true;
2129     }
2130   }
2131 
2132   // x / y -> x * (1.0 / y)
2133   if (Unsafe) {
2134     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2135       .addUse(RHS)
2136       .setMIFlags(Flags);
2137     B.buildFMul(Res, LHS, RCP, Flags);
2138 
2139     MI.eraseFromParent();
2140     return true;
2141   }
2142 
2143   return false;
2144 }
2145 
2146 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2147                                          MachineRegisterInfo &MRI,
2148                                          MachineIRBuilder &B) const {
2149   B.setInstr(MI);
2150   Register Res = MI.getOperand(0).getReg();
2151   Register LHS = MI.getOperand(1).getReg();
2152   Register RHS = MI.getOperand(2).getReg();
2153 
2154   uint16_t Flags = MI.getFlags();
2155 
2156   LLT S16 = LLT::scalar(16);
2157   LLT S32 = LLT::scalar(32);
2158 
2159   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2160   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2161 
2162   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2163     .addUse(RHSExt.getReg(0))
2164     .setMIFlags(Flags);
2165 
2166   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2167   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2168 
2169   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2170     .addUse(RDst.getReg(0))
2171     .addUse(RHS)
2172     .addUse(LHS)
2173     .setMIFlags(Flags);
2174 
2175   MI.eraseFromParent();
2176   return true;
2177 }
2178 
2179 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2180 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2181 static void toggleSPDenormMode(bool Enable,
2182                                MachineIRBuilder &B,
2183                                const GCNSubtarget &ST,
2184                                AMDGPU::SIModeRegisterDefaults Mode) {
2185   // Set SP denorm mode to this value.
2186   unsigned SPDenormMode =
2187     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2188 
2189   if (ST.hasDenormModeInst()) {
2190     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2191     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2192 
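    // S_DENORM_MODE takes a 4-bit immediate: FP32 controls in bits [1:0] and
    // FP64/FP16 controls in bits [3:2].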
2193     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2194     B.buildInstr(AMDGPU::S_DENORM_MODE)
2195       .addImm(NewDenormModeValue);
2196 
2197   } else {
2198     // Select FP32 bit field in mode register.
2199     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2200                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2201                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
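    // This encodes hwreg(HW_REG_MODE, 4, 2): a 2-bit field at offset 4, which
    // is where the FP32 denormal controls live in the MODE register.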
2202 
2203     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2204       .addImm(SPDenormMode)
2205       .addImm(SPDenormModeBitField);
2206   }
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2210                                          MachineRegisterInfo &MRI,
2211                                          MachineIRBuilder &B) const {
2212   B.setInstr(MI);
2213   Register Res = MI.getOperand(0).getReg();
2214   Register LHS = MI.getOperand(1).getReg();
2215   Register RHS = MI.getOperand(2).getReg();
2216   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2217   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2218 
2219   uint16_t Flags = MI.getFlags();
2220 
2221   LLT S32 = LLT::scalar(32);
2222   LLT S1 = LLT::scalar(1);
2223 
2224   auto One = B.buildFConstant(S32, 1.0f);
2225 
2226   auto DenominatorScaled =
2227     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2228       .addUse(RHS)
2229       .addUse(LHS)
2230       .addImm(1)
2231       .setMIFlags(Flags);
2232   auto NumeratorScaled =
2233     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2234       .addUse(LHS)
2235       .addUse(RHS)
2236       .addImm(0)
2237       .setMIFlags(Flags);
2238 
2239   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2240     .addUse(DenominatorScaled.getReg(0))
2241     .setMIFlags(Flags);
2242   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2243 
2244   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2245   // aren't modeled as reading it.
2246   if (!Mode.allFP32Denormals())
2247     toggleSPDenormMode(true, B, ST, Mode);
2248 
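  // FMA-based Newton-Raphson refinement, where d/n are the scaled denominator
  // and numerator and r is the approximate reciprocal:
  //   Fma0 = 1 - d*r         (error of the initial rcp)
  //   Fma1 = r + r*Fma0      (refined reciprocal)
  //   Mul  = n * Fma1        (initial quotient)
  //   Fma2 = n - d*Mul       (quotient residual)
  //   Fma3 = Mul + Fma1*Fma2 (refined quotient)
  //   Fma4 = n - d*Fma3      (final residual, consumed by div_fmas)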
2249   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2250   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2251   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2252   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2253   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2254   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2255 
2256   if (!Mode.allFP32Denormals())
2257     toggleSPDenormMode(false, B, ST, Mode);
2258 
2259   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2260     .addUse(Fma4.getReg(0))
2261     .addUse(Fma1.getReg(0))
2262     .addUse(Fma3.getReg(0))
2263     .addUse(NumeratorScaled.getReg(1))
2264     .setMIFlags(Flags);
2265 
2266   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2267     .addUse(Fmas.getReg(0))
2268     .addUse(RHS)
2269     .addUse(LHS)
2270     .setMIFlags(Flags);
2271 
2272   MI.eraseFromParent();
2273   return true;
2274 }
2275 
2276 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2277                                          MachineRegisterInfo &MRI,
2278                                          MachineIRBuilder &B) const {
2279   B.setInstr(MI);
2280   Register Res = MI.getOperand(0).getReg();
2281   Register LHS = MI.getOperand(1).getReg();
2282   Register RHS = MI.getOperand(2).getReg();
2283 
2284   uint16_t Flags = MI.getFlags();
2285 
2286   LLT S64 = LLT::scalar(64);
2287   LLT S1 = LLT::scalar(1);
2288 
2289   auto One = B.buildFConstant(S64, 1.0);
2290 
2291   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2292     .addUse(LHS)
2293     .addUse(RHS)
2294     .addImm(1)
2295     .setMIFlags(Flags);
2296 
2297   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2298 
2299   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2300     .addUse(DivScale0.getReg(0))
2301     .setMIFlags(Flags);
2302 
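  // Refine the reciprocal with two FMA-based Newton-Raphson steps, then form
  // the quotient and its residual; div_fmas applies the final correction
  // using the scale predicate computed below.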
2303   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2304   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2305   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2306 
2307   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2308     .addUse(LHS)
2309     .addUse(RHS)
2310     .addImm(0)
2311     .setMIFlags(Flags);
2312 
2313   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2315   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2316 
2317   Register Scale;
2318   if (!ST.hasUsableDivScaleConditionOutput()) {
2319     // Workaround a hardware bug on SI where the condition output from div_scale
2320     // is not usable.
2321 
2322     LLT S32 = LLT::scalar(32);
2323 
2324     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2325     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2326     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2327     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2328 
2329     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2330                               Scale1Unmerge.getReg(1));
2331     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2332                               Scale0Unmerge.getReg(1));
2333     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2334   } else {
2335     Scale = DivScale1.getReg(1);
2336   }
2337 
2338   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2339     .addUse(Fma4.getReg(0))
2340     .addUse(Fma3.getReg(0))
2341     .addUse(Mul.getReg(0))
2342     .addUse(Scale)
2343     .setMIFlags(Flags);
2344 
2345   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2346     .addUse(Fmas.getReg(0))
2347     .addUse(RHS)
2348     .addUse(LHS)
2349     .setMIFlags(Flags);
2350 
2351   MI.eraseFromParent();
2352   return true;
2353 }
2354 
2355 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2356                                                  MachineRegisterInfo &MRI,
2357                                                  MachineIRBuilder &B) const {
2358   B.setInstr(MI);
2359   Register Res = MI.getOperand(0).getReg();
2360   Register LHS = MI.getOperand(2).getReg();
2361   Register RHS = MI.getOperand(3).getReg();
2362   uint16_t Flags = MI.getFlags();
2363 
2364   LLT S32 = LLT::scalar(32);
2365   LLT S1 = LLT::scalar(1);
2366 
2367   auto Abs = B.buildFAbs(S32, RHS, Flags);
2368   const APFloat C0Val(1.0f);
2369 
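  // If |RHS| exceeds 2^96 (0x6f800000), pre-scale it by 2^-32 (0x2f800000) so
  // its reciprocal stays out of the denormal range, and fold the same scale
  // factor back into the final result; otherwise the scale is 1.0.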
2370   auto C0 = B.buildConstant(S32, 0x6f800000);
2371   auto C1 = B.buildConstant(S32, 0x2f800000);
2372   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2373 
2374   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2375   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2376 
2377   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2378 
2379   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2380     .addUse(Mul0.getReg(0))
2381     .setMIFlags(Flags);
2382 
2383   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2384 
2385   B.buildFMul(Res, Sel, Mul1, Flags);
2386 
2387   MI.eraseFromParent();
2388   return true;
2389 }
2390 
2391 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2392                                                  MachineRegisterInfo &MRI,
2393                                                  MachineIRBuilder &B) const {
2394   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2395   if (!MFI->isEntryFunction()) {
2396     return legalizePreloadedArgIntrin(MI, MRI, B,
2397                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2398   }
2399 
2400   B.setInstr(MI);
2401 
2402   uint64_t Offset =
2403     ST.getTargetLowering()->getImplicitParameterOffset(
2404       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2405   Register DstReg = MI.getOperand(0).getReg();
2406   LLT DstTy = MRI.getType(DstReg);
2407   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2408 
2409   const ArgDescriptor *Arg;
2410   const TargetRegisterClass *RC;
2411   std::tie(Arg, RC)
2412     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2413   if (!Arg)
2414     return false;
2415 
2416   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2417   if (!loadInputValue(KernargPtrReg, B, Arg))
2418     return false;
2419 
2420   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2421   MI.eraseFromParent();
2422   return true;
2423 }
2424 
2425 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2426                                               MachineRegisterInfo &MRI,
2427                                               MachineIRBuilder &B,
2428                                               unsigned AddrSpace) const {
2429   B.setInstr(MI);
2430   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
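  // A flat pointer is in the queried segment iff the high 32 bits of the
  // pointer equal that segment's aperture base.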
2431   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2432   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2433   MI.eraseFromParent();
2434   return true;
2435 }
2436 
2437 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2438 // offset (the offset that is included in bounds checking and swizzling, to be
2439 // split between the instruction's voffset and immoffset fields) and soffset
2440 // (the offset that is excluded from bounds checking and swizzling, to go in
2441 // the instruction's soffset field).  This function takes the first kind of
2442 // offset and figures out how to split it between voffset and immoffset.
2443 std::tuple<Register, unsigned, unsigned>
2444 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2445                                         Register OrigOffset) const {
2446   const unsigned MaxImm = 4095;
2447   Register BaseReg;
2448   unsigned TotalConstOffset;
2449   MachineInstr *OffsetDef;
2450   const LLT S32 = LLT::scalar(32);
2451 
2452   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2453     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2454 
2455   unsigned ImmOffset = TotalConstOffset;
2456 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in the immoffset field so that the remainder
  // that is copied/added for the voffset field is a multiple of 4096, and it
  // stands a better chance of being CSEd with the copy/add for another
  // similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
2464   unsigned Overflow = ImmOffset & ~MaxImm;
2465   ImmOffset -= Overflow;
2466   if ((int32_t)Overflow < 0) {
2467     Overflow += ImmOffset;
2468     ImmOffset = 0;
2469   }
2470 
2471   if (Overflow != 0) {
2472     if (!BaseReg) {
2473       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2474     } else {
2475       auto OverflowVal = B.buildConstant(S32, Overflow);
2476       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2477     }
2478   }
2479 
2480   if (!BaseReg)
2481     BaseReg = B.buildConstant(S32, 0).getReg(0);
2482 
2483   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2484 }
2485 
2486 /// Handle register layout difference for f16 images for some subtargets.
2487 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2488                                              MachineRegisterInfo &MRI,
2489                                              Register Reg) const {
2490   if (!ST.hasUnpackedD16VMem())
2491     return Reg;
2492 
2493   const LLT S16 = LLT::scalar(16);
2494   const LLT S32 = LLT::scalar(32);
2495   LLT StoreVT = MRI.getType(Reg);
2496   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2497 
2498   auto Unmerge = B.buildUnmerge(S16, Reg);
2499 
2500   SmallVector<Register, 4> WideRegs;
2501   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2502     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2503 
2504   int NumElts = StoreVT.getNumElements();
2505 
2506   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2507 }
2508 
2509 Register AMDGPULegalizerInfo::fixStoreSourceType(
2510   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2511   MachineRegisterInfo *MRI = B.getMRI();
2512   LLT Ty = MRI->getType(VData);
2513 
2514   const LLT S16 = LLT::scalar(16);
2515 
  // Fixup illegal register types for i8 and i16 stores.
2517   if (Ty == LLT::scalar(8) || Ty == S16) {
2518     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2519     return AnyExt;
2520   }
2521 
2522   if (Ty.isVector()) {
2523     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2524       if (IsFormat)
2525         return handleD16VData(B, *MRI, VData);
2526     }
2527   }
2528 
2529   return VData;
2530 }
2531 
2532 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2533                                               MachineRegisterInfo &MRI,
2534                                               MachineIRBuilder &B,
2535                                               bool IsTyped,
2536                                               bool IsFormat) const {
2537   B.setInstr(MI);
2538 
2539   Register VData = MI.getOperand(1).getReg();
2540   LLT Ty = MRI.getType(VData);
2541   LLT EltTy = Ty.getScalarType();
2542   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2543   const LLT S32 = LLT::scalar(32);
2544 
2545   VData = fixStoreSourceType(B, VData, IsFormat);
2546   Register RSrc = MI.getOperand(2).getReg();
2547 
2548   MachineMemOperand *MMO = *MI.memoperands_begin();
2549   const int MemSize = MMO->getSize();
2550 
2551   unsigned ImmOffset;
2552   unsigned TotalOffset;
2553 
2554   // The typed intrinsics add an immediate after the registers.
2555   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2556 
2557   // The struct intrinsic variants add one additional operand over raw.
2558   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2559   Register VIndex;
2560   int OpOffset = 0;
2561   if (HasVIndex) {
2562     VIndex = MI.getOperand(3).getReg();
2563     OpOffset = 1;
2564   }
2565 
2566   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2567   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2568 
2569   unsigned Format = 0;
2570   if (IsTyped) {
2571     Format = MI.getOperand(5 + OpOffset).getImm();
2572     ++OpOffset;
2573   }
2574 
2575   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2576 
2577   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2578   if (TotalOffset != 0)
2579     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2580 
2581   unsigned Opc;
2582   if (IsTyped) {
2583     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2584                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2585   } else if (IsFormat) {
2586     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2587                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2588   } else {
2589     switch (MemSize) {
2590     case 1:
2591       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2592       break;
2593     case 2:
2594       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2595       break;
2596     default:
2597       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2598       break;
2599     }
2600   }
2601 
2602   if (!VIndex)
2603     VIndex = B.buildConstant(S32, 0).getReg(0);
2604 
2605   auto MIB = B.buildInstr(Opc)
2606     .addUse(VData)              // vdata
2607     .addUse(RSrc)               // rsrc
2608     .addUse(VIndex)             // vindex
2609     .addUse(VOffset)            // voffset
2610     .addUse(SOffset)            // soffset
2611     .addImm(ImmOffset);         // offset(imm)
2612 
2613   if (IsTyped)
2614     MIB.addImm(Format);
2615 
2616   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2617      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2618      .addMemOperand(MMO);
2619 
2620   MI.eraseFromParent();
2621   return true;
2622 }
2623 
2624 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2625                                              MachineRegisterInfo &MRI,
2626                                              MachineIRBuilder &B,
2627                                              bool IsFormat,
2628                                              bool IsTyped) const {
2629   B.setInstr(MI);
2630 
2631   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2632   MachineMemOperand *MMO = *MI.memoperands_begin();
2633   const int MemSize = MMO->getSize();
2634   const LLT S32 = LLT::scalar(32);
2635 
2636   Register Dst = MI.getOperand(0).getReg();
2637   Register RSrc = MI.getOperand(2).getReg();
2638 
2639   // The typed intrinsics add an immediate after the registers.
2640   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2641 
2642   // The struct intrinsic variants add one additional operand over raw.
2643   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2644   Register VIndex;
2645   int OpOffset = 0;
2646   if (HasVIndex) {
2647     VIndex = MI.getOperand(3).getReg();
2648     OpOffset = 1;
2649   }
2650 
2651   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2652   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2653 
2654   unsigned Format = 0;
2655   if (IsTyped) {
2656     Format = MI.getOperand(5 + OpOffset).getImm();
2657     ++OpOffset;
2658   }
2659 
2660   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2661   unsigned ImmOffset;
2662   unsigned TotalOffset;
2663 
2664   LLT Ty = MRI.getType(Dst);
2665   LLT EltTy = Ty.getScalarType();
2666   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2667   const bool Unpacked = ST.hasUnpackedD16VMem();
2668 
2669   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2670   if (TotalOffset != 0)
2671     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2672 
2673   unsigned Opc;
2674 
2675   if (IsTyped) {
2676     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2677                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2678   } else if (IsFormat) {
2679     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2680                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2681   } else {
2682     switch (MemSize) {
2683     case 1:
2684       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2685       break;
2686     case 2:
2687       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2688       break;
2689     default:
2690       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2691       break;
2692     }
2693   }
2694 
2695   Register LoadDstReg;
2696 
2697   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2698   LLT UnpackedTy = Ty.changeElementSize(32);
2699 
2700   if (IsExtLoad)
2701     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2702   else if (Unpacked && IsD16 && Ty.isVector())
2703     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2704   else
2705     LoadDstReg = Dst;
2706 
2707   if (!VIndex)
2708     VIndex = B.buildConstant(S32, 0).getReg(0);
2709 
2710   auto MIB = B.buildInstr(Opc)
2711     .addDef(LoadDstReg)         // vdata
2712     .addUse(RSrc)               // rsrc
2713     .addUse(VIndex)             // vindex
2714     .addUse(VOffset)            // voffset
2715     .addUse(SOffset)            // soffset
2716     .addImm(ImmOffset);         // offset(imm)
2717 
2718   if (IsTyped)
2719     MIB.addImm(Format);
2720 
2721   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2722      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2723      .addMemOperand(MMO);
2724 
2725   if (LoadDstReg != Dst) {
2726     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2727 
    // The result register was widened for the extending load; truncate back
    // to the original type.
2729     if (IsExtLoad)
2730       B.buildTrunc(Dst, LoadDstReg);
2731     else {
2732       // Repack to original 16-bit vector result
2733       // FIXME: G_TRUNC should work, but legalization currently fails
2734       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2735       SmallVector<Register, 4> Repack;
2736       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2737         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2738       B.buildMerge(Dst, Repack);
2739     }
2740   }
2741 
2742   MI.eraseFromParent();
2743   return true;
2744 }
2745 
2746 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2747                                                MachineIRBuilder &B,
2748                                                bool IsInc) const {
2749   B.setInstr(MI);
2750   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2751                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2752   B.buildInstr(Opc)
2753     .addDef(MI.getOperand(0).getReg())
2754     .addUse(MI.getOperand(2).getReg())
2755     .addUse(MI.getOperand(3).getReg())
2756     .cloneMemRefs(MI);
2757   MI.eraseFromParent();
2758   return true;
2759 }
2760 
2761 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2762   switch (IntrID) {
2763   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2764   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2765     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2766   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2767   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2768     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2769   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2770   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2771     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2772   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2773   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2774     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2775   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2776   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2777     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2778   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2779   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2780     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2781   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2782   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2783     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2784   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2785   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2786     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2787   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2788   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2789     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2790   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2791   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2792     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
2793   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2794   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2795     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
2796   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2797   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2798     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
2799   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
2800   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
2801     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
2802   default:
2803     llvm_unreachable("unhandled atomic opcode");
2804   }
2805 }
2806 
2807 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
2808                                                MachineIRBuilder &B,
2809                                                Intrinsic::ID IID) const {
2810   B.setInstr(MI);
2811 
2812   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
2813                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
2814 
2815   Register Dst = MI.getOperand(0).getReg();
2816   Register VData = MI.getOperand(2).getReg();
2817 
2818   Register CmpVal;
2819   int OpOffset = 0;
2820 
2821   if (IsCmpSwap) {
2822     CmpVal = MI.getOperand(3 + OpOffset).getReg();
2823     ++OpOffset;
2824   }
2825 
2826   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2827   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2828 
2829   // The struct intrinsic variants add one additional operand over raw.
2830   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2831   Register VIndex;
2832   if (HasVIndex) {
2833     VIndex = MI.getOperand(4 + OpOffset).getReg();
2834     ++OpOffset;
2835   }
2836 
2837   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2838   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2839   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2840 
2841   MachineMemOperand *MMO = *MI.memoperands_begin();
2842 
2843   unsigned ImmOffset;
2844   unsigned TotalOffset;
2845   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2846   if (TotalOffset != 0)
2847     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
2848 
2849   if (!VIndex)
2850     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
2851 
2852   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
2853     .addDef(Dst)
2854     .addUse(VData); // vdata
2855 
2856   if (IsCmpSwap)
2857     MIB.addReg(CmpVal);
2858 
2859   MIB.addUse(RSrc)               // rsrc
2860      .addUse(VIndex)             // vindex
2861      .addUse(VOffset)            // voffset
2862      .addUse(SOffset)            // soffset
2863      .addImm(ImmOffset)          // offset(imm)
2864      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2865      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2866      .addMemOperand(MMO);
2867 
2868   MI.eraseFromParent();
2869   return true;
2870 }
2871 
2872 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
2873     MachineInstr &MI, MachineIRBuilder &B,
2874     GISelChangeObserver &Observer,
2875     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
2876   // We are only processing the operands of d16 image operations on subtargets
2877   // that use the unpacked register layout.
2878   if (!ST.hasUnpackedD16VMem())
2879     return true;
2880 
2881   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2882     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
2883 
2884   if (BaseOpcode->Atomic) // No d16 atomics
2885     return true;
2886 
2887   MachineRegisterInfo *MRI = B.getMRI();
2888   const LLT S32 = LLT::scalar(32);
2889   const LLT S16 = LLT::scalar(16);
2890 
2891   if (BaseOpcode->Store) {
2892     Register VData = MI.getOperand(1).getReg();
2893     LLT Ty = MRI->getType(VData);
2894     if (!Ty.isVector() || Ty.getElementType() != S16)
2895       return true;
2896 
2897     B.setInstr(MI);
2898 
2899     Observer.changingInstr(MI);
2900     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
2901     Observer.changedInstr(MI);
2902     return true;
2903   }
2904 
2905   // Must be an image load.
2906   Register DstReg = MI.getOperand(0).getReg();
2907   LLT Ty = MRI->getType(DstReg);
2908   if (!Ty.isVector() || Ty.getElementType() != S16)
2909     return true;
2910 
2911   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2912 
2913   LLT WidenedTy = Ty.changeElementType(S32);
2914   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
2915 
2916   Observer.changingInstr(MI);
2917   MI.getOperand(0).setReg(WideDstReg);
2918   Observer.changedInstr(MI);
2919 
  // FIXME: A plain vector trunc should be sufficient, but its legalization is
  // currently broken.
2922   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
2923 
2924   int NumOps = Unmerge->getNumOperands() - 1;
2925   SmallVector<Register, 4> RemergeParts(NumOps);
2926   for (int I = 0; I != NumOps; ++I)
2927     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
2928 
2929   B.buildBuildVector(DstReg, RemergeParts);
2930   return true;
2931 }
2932 
2933 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2934                                             MachineIRBuilder &B,
2935                                             GISelChangeObserver &Observer) const {
2936   MachineRegisterInfo &MRI = *B.getMRI();
2937 
  // Replace the G_BRCOND use with the exec-manipulation and branch pseudos.
2939   auto IntrID = MI.getIntrinsicID();
2940   switch (IntrID) {
2941   case Intrinsic::amdgcn_if:
2942   case Intrinsic::amdgcn_else: {
2943     MachineInstr *Br = nullptr;
2944     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2945       const SIRegisterInfo *TRI
2946         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2947 
2948       B.setInstr(*BrCond);
2949       Register Def = MI.getOperand(1).getReg();
2950       Register Use = MI.getOperand(3).getReg();
2951 
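      // Branch to the following unconditional G_BR's destination if there is
      // one, otherwise to the G_BRCOND's target; the G_BR itself is retargeted
      // below.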
      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
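    // Like amdgcn_if/else, the loop intrinsic must feed a G_BRCOND; replace
    // that branch with the SI_LOOP pseudo operating on the wave mask.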
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      // FIXME: Need to adjust branch targets based on unconditional branch.
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
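  // All buffer atomics share a single lowering; the cmpswap variants carry an
  // extra compare operand that legalizeBufferAtomic threads through.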
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
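  // Anything remaining that matches the image dim table is an image
  // intrinsic; everything else needs no custom work here.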
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}