1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
// Round the number of vector elements up to the next power of two.
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
// Round the scalar bit width up to the next power of two.
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
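// True if the type at TypeIdx has an element size that is a multiple of 32
// bits and a total size of at most MaxSize bits.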
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
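// Vectors with an odd number of sub-32-bit elements whose total size is not a
// multiple of 32 bits (e.g. v3s16).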
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
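// Vectors of 16-bit elements with more than two elements (e.g. v4s16).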
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
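// Mutation that pads the vector at TypeIdx with one extra element
// (e.g. v3s16 -> v4s16).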
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
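// Mutation that reduces the element count so that each resulting piece is at
// most 64 bits wide.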
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
// Increase the number of vector elements so the total size is the next
// multiple of 32 bits.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
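// True if the type at TypeIdx is a vector with a total size below Size bits.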
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
// Any combination of 32, 64, 128 or 256-bit elements, multiples of v2s16, and
// scalars that are a multiple of 32 bits, up to 1024 bits.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
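// Scalar stores wider than 32 bits that are truncated by the memory access
// (the memory size is smaller than the register size).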
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
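// True if the type at TypeIdx0 is narrower than the type at TypeIdx1.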
181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
182   return [=](const LegalityQuery &Query) {
183     return Query.Types[TypeIdx0].getSizeInBits() <
184            Query.Types[TypeIdx1].getSizeInBits();
185   };
186 }
187 
188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
189   return [=](const LegalityQuery &Query) {
190     return Query.Types[TypeIdx0].getSizeInBits() >
191            Query.Types[TypeIdx1].getSizeInBits();
192   };
193 }
194 
195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
196                                          const GCNTargetMachine &TM)
197   :  ST(ST_) {
198   using namespace TargetOpcode;
199 
200   auto GetAddrSpacePtr = [&TM](unsigned AS) {
201     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
202   };
203 
204   const LLT S1 = LLT::scalar(1);
205   const LLT S16 = LLT::scalar(16);
206   const LLT S32 = LLT::scalar(32);
207   const LLT S64 = LLT::scalar(64);
208   const LLT S128 = LLT::scalar(128);
209   const LLT S256 = LLT::scalar(256);
210   const LLT S1024 = LLT::scalar(1024);
211 
212   const LLT V2S16 = LLT::vector(2, 16);
213   const LLT V4S16 = LLT::vector(4, 16);
214 
215   const LLT V2S32 = LLT::vector(2, 32);
216   const LLT V3S32 = LLT::vector(3, 32);
217   const LLT V4S32 = LLT::vector(4, 32);
218   const LLT V5S32 = LLT::vector(5, 32);
219   const LLT V6S32 = LLT::vector(6, 32);
220   const LLT V7S32 = LLT::vector(7, 32);
221   const LLT V8S32 = LLT::vector(8, 32);
222   const LLT V9S32 = LLT::vector(9, 32);
223   const LLT V10S32 = LLT::vector(10, 32);
224   const LLT V11S32 = LLT::vector(11, 32);
225   const LLT V12S32 = LLT::vector(12, 32);
226   const LLT V13S32 = LLT::vector(13, 32);
227   const LLT V14S32 = LLT::vector(14, 32);
228   const LLT V15S32 = LLT::vector(15, 32);
229   const LLT V16S32 = LLT::vector(16, 32);
230   const LLT V32S32 = LLT::vector(32, 32);
231 
232   const LLT V2S64 = LLT::vector(2, 64);
233   const LLT V3S64 = LLT::vector(3, 64);
234   const LLT V4S64 = LLT::vector(4, 64);
235   const LLT V5S64 = LLT::vector(5, 64);
236   const LLT V6S64 = LLT::vector(6, 64);
237   const LLT V7S64 = LLT::vector(7, 64);
238   const LLT V8S64 = LLT::vector(8, 64);
239   const LLT V16S64 = LLT::vector(16, 64);
240 
241   std::initializer_list<LLT> AllS32Vectors =
242     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
243      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
244   std::initializer_list<LLT> AllS64Vectors =
245     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
246 
247   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
248   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
249   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
250   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
251   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
252   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
253   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
254 
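  // Code/block addresses are represented with flat pointers.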
255   const LLT CodePtr = FlatPtr;
256 
257   const std::initializer_list<LLT> AddrSpaces64 = {
258     GlobalPtr, ConstantPtr, FlatPtr
259   };
260 
261   const std::initializer_list<LLT> AddrSpaces32 = {
262     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
263   };
264 
265   const std::initializer_list<LLT> FPTypesBase = {
266     S32, S64
267   };
268 
269   const std::initializer_list<LLT> FPTypes16 = {
270     S32, S64, S16
271   };
272 
273   const std::initializer_list<LLT> FPTypesPK16 = {
274     S32, S64, S16, V2S16
275   };
276 
277   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
278 
279   setAction({G_BRCOND, S1}, Legal); // VCC branches
280   setAction({G_BRCOND, S32}, Legal); // SCC branches
281 
282   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
283   // elements for v3s16
284   getActionDefinitionsBuilder(G_PHI)
285     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
286     .legalFor(AllS32Vectors)
287     .legalFor(AllS64Vectors)
288     .legalFor(AddrSpaces64)
289     .legalFor(AddrSpaces32)
290     .clampScalar(0, S32, S256)
291     .widenScalarToNextPow2(0, 32)
292     .clampMaxNumElements(0, S32, 16)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .legalIf(isPointer(0));
295 
296   if (ST.has16BitInsts()) {
297     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
298       .legalFor({S32, S16})
299       .clampScalar(0, S16, S32)
300       .scalarize(0)
301       .widenScalarToNextPow2(0, 32);
302   } else {
303     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
304       .legalFor({S32})
305       .clampScalar(0, S32, S32)
306       .scalarize(0);
307   }
308 
309   // FIXME: Not really legal. Placeholder for custom lowering.
310   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
311     .customFor({S32, S64})
312     .clampScalar(0, S32, S64)
313     .widenScalarToNextPow2(0, 32)
314     .scalarize(0);
315 
316   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
317     .legalFor({S32})
318     .clampScalar(0, S32, S32)
319     .scalarize(0);
320 
321   // Report legal for any types we can handle anywhere. For the cases only legal
322   // on the SALU, RegBankSelect will be able to re-legalize.
323   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
324     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
325     .clampScalar(0, S32, S64)
326     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
327     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
328     .widenScalarToNextPow2(0)
329     .scalarize(0);
330 
331   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
332                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
333     .legalFor({{S32, S1}, {S32, S32}})
334     .minScalar(0, S32)
335     // TODO: .scalarize(0)
336     .lower();
337 
338   getActionDefinitionsBuilder(G_BITCAST)
339     // Don't worry about the size constraint.
340     .legalIf(all(isRegisterType(0), isRegisterType(1)))
341     .lower();
342 
343 
344   getActionDefinitionsBuilder(G_CONSTANT)
345     .legalFor({S1, S32, S64, S16, GlobalPtr,
346                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
347     .clampScalar(0, S32, S64)
348     .widenScalarToNextPow2(0)
349     .legalIf(isPointer(0));
350 
351   getActionDefinitionsBuilder(G_FCONSTANT)
352     .legalFor({S32, S64, S16})
353     .clampScalar(0, S16, S64);
354 
355   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
356     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
357                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
358     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
359     .clampScalarOrElt(0, S32, S1024)
360     .legalIf(isMultiple32(0))
361     .widenScalarToNextPow2(0, 32)
362     .clampMaxNumElements(0, S32, 16);
363 
364   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
365   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
366     .unsupportedFor({PrivatePtr})
367     .custom();
368   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
369 
370   auto &FPOpActions = getActionDefinitionsBuilder(
371     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
372     .legalFor({S32, S64});
373   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
374     .customFor({S32, S64});
375   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
376     .customFor({S32, S64});
377 
378   if (ST.has16BitInsts()) {
379     if (ST.hasVOP3PInsts())
380       FPOpActions.legalFor({S16, V2S16});
381     else
382       FPOpActions.legalFor({S16});
383 
384     TrigActions.customFor({S16});
385     FDIVActions.customFor({S16});
386   }
387 
388   auto &MinNumMaxNum = getActionDefinitionsBuilder({
389       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
390 
391   if (ST.hasVOP3PInsts()) {
392     MinNumMaxNum.customFor(FPTypesPK16)
393       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
394       .clampMaxNumElements(0, S16, 2)
395       .clampScalar(0, S16, S64)
396       .scalarize(0);
397   } else if (ST.has16BitInsts()) {
398     MinNumMaxNum.customFor(FPTypes16)
399       .clampScalar(0, S16, S64)
400       .scalarize(0);
401   } else {
402     MinNumMaxNum.customFor(FPTypesBase)
403       .clampScalar(0, S32, S64)
404       .scalarize(0);
405   }
406 
407   if (ST.hasVOP3PInsts())
408     FPOpActions.clampMaxNumElements(0, S16, 2);
409 
410   FPOpActions
411     .scalarize(0)
412     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
413 
414   TrigActions
415     .scalarize(0)
416     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
417 
418   FDIVActions
419     .scalarize(0)
420     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
421 
422   getActionDefinitionsBuilder({G_FNEG, G_FABS})
423     .legalFor(FPTypesPK16)
424     .clampMaxNumElements(0, S16, 2)
425     .scalarize(0)
426     .clampScalar(0, S16, S64);
427 
428   if (ST.has16BitInsts()) {
429     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
430       .legalFor({S32, S64, S16})
431       .scalarize(0)
432       .clampScalar(0, S16, S64);
433   } else {
434     getActionDefinitionsBuilder(G_FSQRT)
435       .legalFor({S32, S64})
436       .scalarize(0)
437       .clampScalar(0, S32, S64);
438 
439     if (ST.hasFractBug()) {
440       getActionDefinitionsBuilder(G_FFLOOR)
441         .customFor({S64})
442         .legalFor({S32, S64})
443         .scalarize(0)
444         .clampScalar(0, S32, S64);
445     } else {
446       getActionDefinitionsBuilder(G_FFLOOR)
447         .legalFor({S32, S64})
448         .scalarize(0)
449         .clampScalar(0, S32, S64);
450     }
451   }
452 
453   getActionDefinitionsBuilder(G_FPTRUNC)
454     .legalFor({{S32, S64}, {S16, S32}})
455     .scalarize(0)
456     .lower();
457 
458   getActionDefinitionsBuilder(G_FPEXT)
459     .legalFor({{S64, S32}, {S32, S16}})
460     .lowerFor({{S64, S16}}) // FIXME: Implement
461     .scalarize(0);
462 
463   getActionDefinitionsBuilder(G_FSUB)
464       // Use actual fsub instruction
465       .legalFor({S32})
466       // Must use fadd + fneg
467       .lowerFor({S64, S16, V2S16})
468       .scalarize(0)
469       .clampScalar(0, S32, S64);
470 
471   // Whether this is legal depends on the floating point mode for the function.
472   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
473   if (ST.hasMadF16())
474     FMad.customFor({S32, S16});
475   else
476     FMad.customFor({S32});
477   FMad.scalarize(0)
478       .lower();
479 
480   getActionDefinitionsBuilder(G_TRUNC)
481     .alwaysLegal();
482 
483   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
484     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
485                {S32, S1}, {S64, S1}, {S16, S1}})
486     .scalarize(0)
487     .clampScalar(0, S32, S64)
488     .widenScalarToNextPow2(1, 32);
489 
490   // TODO: Split s1->s64 during regbankselect for VALU.
491   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
492     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
493     .lowerFor({{S32, S64}})
494     .lowerIf(typeIs(1, S1))
495     .customFor({{S64, S64}});
496   if (ST.has16BitInsts())
497     IToFP.legalFor({{S16, S16}});
498   IToFP.clampScalar(1, S32, S64)
499        .scalarize(0)
500        .widenScalarToNextPow2(1);
501 
502   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
503     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
504     .customFor({{S64, S64}});
505   if (ST.has16BitInsts())
506     FPToI.legalFor({{S16, S16}});
507   else
508     FPToI.minScalar(1, S32);
509 
510   FPToI.minScalar(0, S32)
511        .scalarize(0)
512        .lower();
513 
514   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
515     .scalarize(0)
516     .lower();
517 
518   if (ST.has16BitInsts()) {
519     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
520       .legalFor({S16, S32, S64})
521       .clampScalar(0, S16, S64)
522       .scalarize(0);
523   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
524     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
525       .legalFor({S32, S64})
526       .clampScalar(0, S32, S64)
527       .scalarize(0);
528   } else {
529     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
530       .legalFor({S32})
531       .customFor({S64})
532       .clampScalar(0, S32, S64)
533       .scalarize(0);
534   }
535 
536   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
537     .scalarize(0)
538     .alwaysLegal();
539 
540   auto &CmpBuilder =
541     getActionDefinitionsBuilder(G_ICMP)
542     // The compare output type differs based on the register bank of the output,
543     // so make both s1 and s32 legal.
544     //
545     // Scalar compares producing output in scc will be promoted to s32, as that
546     // is the allocatable register type that will be needed for the copy from
547     // scc. This will be promoted during RegBankSelect, and we assume something
548     // before that won't try to use s32 result types.
549     //
550     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
551     // bank.
552     .legalForCartesianProduct(
553       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
554     .legalForCartesianProduct(
555       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
556   if (ST.has16BitInsts()) {
557     CmpBuilder.legalFor({{S1, S16}});
558   }
559 
560   CmpBuilder
561     .widenScalarToNextPow2(1)
562     .clampScalar(1, S32, S64)
563     .scalarize(0)
564     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
565 
566   getActionDefinitionsBuilder(G_FCMP)
567     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
568     .widenScalarToNextPow2(1)
569     .clampScalar(1, S32, S64)
570     .scalarize(0);
571 
572   // FIXME: fpow has a selection pattern that should move to custom lowering.
573   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
574   if (ST.has16BitInsts())
575     Exp2Ops.legalFor({S32, S16});
576   else
577     Exp2Ops.legalFor({S32});
578   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
579   Exp2Ops.scalarize(0);
580 
581   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
582   if (ST.has16BitInsts())
583     ExpOps.customFor({{S32}, {S16}});
584   else
585     ExpOps.customFor({S32});
586   ExpOps.clampScalar(0, MinScalarFPTy, S32)
587         .scalarize(0);
588 
589   // The 64-bit versions produce 32-bit results, but only on the SALU.
590   getActionDefinitionsBuilder(G_CTPOP)
591     .legalFor({{S32, S32}, {S32, S64}})
592     .clampScalar(0, S32, S32)
593     .clampScalar(1, S32, S64)
594     .scalarize(0)
595     .widenScalarToNextPow2(0, 32)
596     .widenScalarToNextPow2(1, 32);
597 
598   // The hardware instructions return a different result on 0 than the generic
599   // instructions expect. The hardware produces -1, but these produce the
600   // bitwidth.
601   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
602     .scalarize(0)
603     .clampScalar(0, S32, S32)
604     .clampScalar(1, S32, S64)
605     .widenScalarToNextPow2(0, 32)
606     .widenScalarToNextPow2(1, 32)
607     .lower();
608 
609   // The 64-bit versions produce 32-bit results, but only on the SALU.
610   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
611     .legalFor({{S32, S32}, {S32, S64}})
612     .clampScalar(0, S32, S32)
613     .clampScalar(1, S32, S64)
614     .scalarize(0)
615     .widenScalarToNextPow2(0, 32)
616     .widenScalarToNextPow2(1, 32);
617 
618   getActionDefinitionsBuilder(G_BITREVERSE)
619     .legalFor({S32})
620     .clampScalar(0, S32, S32)
621     .scalarize(0);
622 
623   if (ST.has16BitInsts()) {
624     getActionDefinitionsBuilder(G_BSWAP)
625       .legalFor({S16, S32, V2S16})
626       .clampMaxNumElements(0, S16, 2)
627       // FIXME: Fixing non-power-of-2 before clamp is workaround for
628       // narrowScalar limitation.
629       .widenScalarToNextPow2(0)
630       .clampScalar(0, S16, S32)
631       .scalarize(0);
632 
633     if (ST.hasVOP3PInsts()) {
634       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
635         .legalFor({S32, S16, V2S16})
636         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
637         .clampMaxNumElements(0, S16, 2)
638         .clampScalar(0, S16, S32)
639         .widenScalarToNextPow2(0)
640         .scalarize(0);
641     } else {
642       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
643         .legalFor({S32, S16})
644         .widenScalarToNextPow2(0)
645         .clampScalar(0, S16, S32)
646         .scalarize(0);
647     }
648   } else {
649     // TODO: Should have same legality without v_perm_b32
650     getActionDefinitionsBuilder(G_BSWAP)
651       .legalFor({S32})
652       .lowerIf(narrowerThan(0, 32))
653       // FIXME: Fixing non-power-of-2 before clamp is workaround for
654       // narrowScalar limitation.
655       .widenScalarToNextPow2(0)
656       .maxScalar(0, S32)
657       .scalarize(0)
658       .lower();
659 
660     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
661       .legalFor({S32})
662       .clampScalar(0, S32, S32)
663       .widenScalarToNextPow2(0)
664       .scalarize(0);
665   }
666 
667   getActionDefinitionsBuilder(G_INTTOPTR)
668     // List the common cases
669     .legalForCartesianProduct(AddrSpaces64, {S64})
670     .legalForCartesianProduct(AddrSpaces32, {S32})
671     .scalarize(0)
672     // Accept any address space as long as the size matches
673     .legalIf(sameSize(0, 1))
674     .widenScalarIf(smallerThan(1, 0),
675       [](const LegalityQuery &Query) {
676         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
677       })
678     .narrowScalarIf(greaterThan(1, 0),
679       [](const LegalityQuery &Query) {
680         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
681       });
682 
683   getActionDefinitionsBuilder(G_PTRTOINT)
684     // List the common cases
685     .legalForCartesianProduct(AddrSpaces64, {S64})
686     .legalForCartesianProduct(AddrSpaces32, {S32})
687     .scalarize(0)
688     // Accept any address space as long as the size matches
689     .legalIf(sameSize(0, 1))
690     .widenScalarIf(smallerThan(0, 1),
691       [](const LegalityQuery &Query) {
692         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
693       })
694     .narrowScalarIf(
695       greaterThan(0, 1),
696       [](const LegalityQuery &Query) {
697         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
698       });
699 
700   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
701     .scalarize(0)
702     .custom();
703 
704   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
705   // handle some operations by just promoting the register during
706   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
707   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
708     switch (AS) {
709     // FIXME: Private element size.
710     case AMDGPUAS::PRIVATE_ADDRESS:
711       return 32;
712     // FIXME: Check subtarget
713     case AMDGPUAS::LOCAL_ADDRESS:
714       return ST.useDS128() ? 128 : 64;
715 
716     // Treat constant and global as identical. SMRD loads are sometimes usable
717     // for global loads (ideally constant address space should be eliminated)
718     // depending on the context. Legality cannot be context dependent, but
719     // RegBankSelect can split the load as necessary depending on the pointer
720     // register bank/uniformity and if the memory is invariant or not written in
721     // a kernel.
722     case AMDGPUAS::CONSTANT_ADDRESS:
723     case AMDGPUAS::GLOBAL_ADDRESS:
724       return IsLoad ? 512 : 128;
725     default:
726       return 128;
727     }
728   };
729 
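  // Return true if a memory access of the queried type must be broken up:
  // extending vector loads, accesses wider than the address space allows,
  // oddly sized accesses, and under-aligned accesses.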
730   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
731                                     bool IsLoad) -> bool {
732     const LLT DstTy = Query.Types[0];
733 
734     // Split vector extloads.
735     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
736     unsigned Align = Query.MMODescrs[0].AlignInBits;
737 
738     if (MemSize < DstTy.getSizeInBits())
739       MemSize = std::max(MemSize, Align);
740 
741     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
742       return true;
743 
744     const LLT PtrTy = Query.Types[1];
745     unsigned AS = PtrTy.getAddressSpace();
746     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
747       return true;
748 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
750     // TODO: May be able to widen depending on alignment etc.
751     unsigned NumRegs = (MemSize + 31) / 32;
752     if (NumRegs == 3) {
753       if (!ST.hasDwordx3LoadStores())
754         return true;
755     } else {
756       // If the alignment allows, these should have been widened.
757       if (!isPowerOf2_32(NumRegs))
758         return true;
759     }
760 
761     if (Align < MemSize) {
762       const SITargetLowering *TLI = ST.getTargetLowering();
763       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
764     }
765 
766     return false;
767   };
768 
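  // Return true if a load with a non-power-of-2 result size should be widened
  // to the next power of 2 rather than split, which is only done when the
  // access is aligned to the widened size and below the address-space limit.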
769   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
770     unsigned Size = Query.Types[0].getSizeInBits();
771     if (isPowerOf2_32(Size))
772       return false;
773 
774     if (Size == 96 && ST.hasDwordx3LoadStores())
775       return false;
776 
777     unsigned AddrSpace = Query.Types[1].getAddressSpace();
778     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
779       return false;
780 
781     unsigned Align = Query.MMODescrs[0].AlignInBits;
782     unsigned RoundedSize = NextPowerOf2(Size);
783     return (Align >= RoundedSize);
784   };
785 
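  // Required alignments (in bits) for global and flat accesses; zero means no
  // alignment requirement when unaligned buffer accesses are supported.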
786   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
787   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
788   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
789 
790   // TODO: Refine based on subtargets which support unaligned access or 128-bit
791   // LDS
792   // TODO: Unsupported flat for SI.
793 
794   for (unsigned Op : {G_LOAD, G_STORE}) {
795     const bool IsStore = Op == G_STORE;
796 
797     auto &Actions = getActionDefinitionsBuilder(Op);
798     // Whitelist the common cases.
799     // TODO: Loads to s16 on gfx9
800     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
801                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
802                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
803                                       {S128, GlobalPtr, 128, GlobalAlign32},
804                                       {S64, GlobalPtr, 64, GlobalAlign32},
805                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
806                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
807                                       {S32, GlobalPtr, 8, GlobalAlign8},
808                                       {S32, GlobalPtr, 16, GlobalAlign16},
809 
810                                       {S32, LocalPtr, 32, 32},
811                                       {S64, LocalPtr, 64, 32},
812                                       {V2S32, LocalPtr, 64, 32},
813                                       {S32, LocalPtr, 8, 8},
814                                       {S32, LocalPtr, 16, 16},
815                                       {V2S16, LocalPtr, 32, 32},
816 
817                                       {S32, PrivatePtr, 32, 32},
818                                       {S32, PrivatePtr, 8, 8},
819                                       {S32, PrivatePtr, 16, 16},
820                                       {V2S16, PrivatePtr, 32, 32},
821 
822                                       {S32, FlatPtr, 32, GlobalAlign32},
823                                       {S32, FlatPtr, 16, GlobalAlign16},
824                                       {S32, FlatPtr, 8, GlobalAlign8},
825                                       {V2S16, FlatPtr, 32, GlobalAlign32},
826 
827                                       {S32, ConstantPtr, 32, GlobalAlign32},
828                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
829                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
830                                       {S64, ConstantPtr, 64, GlobalAlign32},
831                                       {S128, ConstantPtr, 128, GlobalAlign32},
832                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
833     Actions
834         .customIf(typeIs(1, Constant32Ptr))
835         // Widen suitably aligned loads by loading extra elements.
836         .moreElementsIf([=](const LegalityQuery &Query) {
837             const LLT Ty = Query.Types[0];
838             return Op == G_LOAD && Ty.isVector() &&
839                    shouldWidenLoadResult(Query);
840           }, moreElementsToNextPow2(0))
841         .widenScalarIf([=](const LegalityQuery &Query) {
842             const LLT Ty = Query.Types[0];
843             return Op == G_LOAD && !Ty.isVector() &&
844                    shouldWidenLoadResult(Query);
845           }, widenScalarOrEltToNextPow2(0))
846         .narrowScalarIf(
847             [=](const LegalityQuery &Query) -> bool {
848               return !Query.Types[0].isVector() &&
849                      needToSplitMemOp(Query, Op == G_LOAD);
850             },
851             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
852               const LLT DstTy = Query.Types[0];
853               const LLT PtrTy = Query.Types[1];
854 
855               const unsigned DstSize = DstTy.getSizeInBits();
856               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
857 
858               // Split extloads.
859               if (DstSize > MemSize)
860                 return std::make_pair(0, LLT::scalar(MemSize));
861 
862               if (!isPowerOf2_32(DstSize)) {
863                 // We're probably decomposing an odd sized store. Try to split
864                 // to the widest type. TODO: Account for alignment. As-is it
865                 // should be OK, since the new parts will be further legalized.
866                 unsigned FloorSize = PowerOf2Floor(DstSize);
867                 return std::make_pair(0, LLT::scalar(FloorSize));
868               }
869 
870               if (DstSize > 32 && (DstSize % 32 != 0)) {
871                 // FIXME: Need a way to specify non-extload of larger size if
872                 // suitably aligned.
873                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
874               }
875 
876               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
877                                                      Op == G_LOAD);
878               if (MemSize > MaxSize)
879                 return std::make_pair(0, LLT::scalar(MaxSize));
880 
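              // Otherwise the access must have been under-aligned; split it
              // down to the known alignment.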
881               unsigned Align = Query.MMODescrs[0].AlignInBits;
882               return std::make_pair(0, LLT::scalar(Align));
883             })
884         .fewerElementsIf(
885             [=](const LegalityQuery &Query) -> bool {
886               return Query.Types[0].isVector() &&
887                      needToSplitMemOp(Query, Op == G_LOAD);
888             },
889             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
890               const LLT DstTy = Query.Types[0];
891               const LLT PtrTy = Query.Types[1];
892 
893               LLT EltTy = DstTy.getElementType();
894               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
895                                                      Op == G_LOAD);
896 
897               // FIXME: Handle widened to power of 2 results better. This ends
898               // up scalarizing.
899               // FIXME: 3 element stores scalarized on SI
900 
901               // Split if it's too large for the address space.
902               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
903                 unsigned NumElts = DstTy.getNumElements();
904                 unsigned EltSize = EltTy.getSizeInBits();
905 
906                 if (MaxSize % EltSize == 0) {
907                   return std::make_pair(
908                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
909                 }
910 
911                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
912 
913                 // FIXME: Refine when odd breakdowns handled
914                 // The scalars will need to be re-legalized.
915                 if (NumPieces == 1 || NumPieces >= NumElts ||
916                     NumElts % NumPieces != 0)
917                   return std::make_pair(0, EltTy);
918 
919                 return std::make_pair(0,
920                                       LLT::vector(NumElts / NumPieces, EltTy));
921               }
922 
923               // FIXME: We could probably handle weird extending loads better.
924               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
925               if (DstTy.getSizeInBits() > MemSize)
926                 return std::make_pair(0, EltTy);
927 
928               unsigned EltSize = EltTy.getSizeInBits();
929               unsigned DstSize = DstTy.getSizeInBits();
930               if (!isPowerOf2_32(DstSize)) {
931                 // We're probably decomposing an odd sized store. Try to split
932                 // to the widest type. TODO: Account for alignment. As-is it
933                 // should be OK, since the new parts will be further legalized.
934                 unsigned FloorSize = PowerOf2Floor(DstSize);
935                 return std::make_pair(
936                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
937               }
938 
939               // Need to split because of alignment.
940               unsigned Align = Query.MMODescrs[0].AlignInBits;
941               if (EltSize > Align &&
942                   (EltSize / Align < DstTy.getNumElements())) {
943                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
944               }
945 
946               // May need relegalization for the scalars.
947               return std::make_pair(0, EltTy);
948             })
949         .minScalar(0, S32);
950 
951     if (IsStore)
952       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
953 
954     // TODO: Need a bitcast lower option?
955     Actions
956         .legalIf([=](const LegalityQuery &Query) {
957           const LLT Ty0 = Query.Types[0];
958           unsigned Size = Ty0.getSizeInBits();
959           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
960           unsigned Align = Query.MMODescrs[0].AlignInBits;
961 
962           // FIXME: Widening store from alignment not valid.
963           if (MemSize < Size)
964             MemSize = std::max(MemSize, Align);
965 
966           // No extending vector loads.
967           if (Size > MemSize && Ty0.isVector())
968             return false;
969 
970           switch (MemSize) {
971           case 8:
972           case 16:
973             return Size == 32;
974           case 32:
975           case 64:
976           case 128:
977             return true;
978           case 96:
979             return ST.hasDwordx3LoadStores();
980           case 256:
981           case 512:
982             return true;
983           default:
984             return false;
985           }
986         })
987         .widenScalarToNextPow2(0)
988         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
989   }
990 
991   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
992                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
993                                                   {S32, GlobalPtr, 16, 2 * 8},
994                                                   {S32, LocalPtr, 8, 8},
995                                                   {S32, LocalPtr, 16, 16},
996                                                   {S32, PrivatePtr, 8, 8},
997                                                   {S32, PrivatePtr, 16, 16},
998                                                   {S32, ConstantPtr, 8, 8},
999                                                   {S32, ConstantPtr, 16, 2 * 8}});
1000   if (ST.hasFlatAddressSpace()) {
1001     ExtLoads.legalForTypesWithMemDesc(
1002         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1003   }
1004 
1005   ExtLoads.clampScalar(0, S32, S32)
1006           .widenScalarToNextPow2(0)
1007           .unsupportedIfMemSizeNotPow2()
1008           .lower();
1009 
1010   auto &Atomics = getActionDefinitionsBuilder(
1011     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1012      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1013      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1014      G_ATOMICRMW_UMIN})
1015     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1016                {S64, GlobalPtr}, {S64, LocalPtr}});
1017   if (ST.hasFlatAddressSpace()) {
1018     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1019   }
1020 
1021   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1022     .legalFor({{S32, LocalPtr}});
1023 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1026   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1027     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1028                 {S32, FlatPtr}, {S64, FlatPtr}})
1029     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1030                {S32, RegionPtr}, {S64, RegionPtr}});
1031   // TODO: Pointer types, any 32-bit or 64-bit vector
1032 
1033   // Condition should be s32 for scalar, s1 for vector.
1034   getActionDefinitionsBuilder(G_SELECT)
1035     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1036           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1037           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1038     .clampScalar(0, S16, S64)
1039     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1040     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1041     .scalarize(1)
1042     .clampMaxNumElements(0, S32, 2)
1043     .clampMaxNumElements(0, LocalPtr, 2)
1044     .clampMaxNumElements(0, PrivatePtr, 2)
1045     .scalarize(0)
1046     .widenScalarToNextPow2(0)
1047     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1048 
1049   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1050   // be more flexible with the shift amount type.
1051   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1052     .legalFor({{S32, S32}, {S64, S32}});
1053   if (ST.has16BitInsts()) {
1054     if (ST.hasVOP3PInsts()) {
1055       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1056             .clampMaxNumElements(0, S16, 2);
1057     } else
1058       Shifts.legalFor({{S16, S32}, {S16, S16}});
1059 
1060     // TODO: Support 16-bit shift amounts
1061     Shifts.clampScalar(1, S32, S32);
1062     Shifts.clampScalar(0, S16, S64);
1063     Shifts.widenScalarToNextPow2(0, 16);
1064   } else {
1065     // Make sure we legalize the shift amount type first, as the general
1066     // expansion for the shifted type will produce much worse code if it hasn't
1067     // been truncated already.
1068     Shifts.clampScalar(1, S32, S32);
1069     Shifts.clampScalar(0, S32, S64);
1070     Shifts.widenScalarToNextPow2(0, 32);
1071   }
1072   Shifts.scalarize(0);
1073 
1074   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1075     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1076     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1077     unsigned IdxTypeIdx = 2;
1078 
1079     getActionDefinitionsBuilder(Op)
1080       .customIf([=](const LegalityQuery &Query) {
1081           const LLT EltTy = Query.Types[EltTypeIdx];
1082           const LLT VecTy = Query.Types[VecTypeIdx];
1083           const LLT IdxTy = Query.Types[IdxTypeIdx];
1084           return (EltTy.getSizeInBits() == 16 ||
1085                   EltTy.getSizeInBits() % 32 == 0) &&
1086                  VecTy.getSizeInBits() % 32 == 0 &&
1087                  VecTy.getSizeInBits() <= 1024 &&
1088                  IdxTy.getSizeInBits() == 32;
1089         })
1090       .clampScalar(EltTypeIdx, S32, S64)
1091       .clampScalar(VecTypeIdx, S32, S64)
1092       .clampScalar(IdxTypeIdx, S32, S32);
1093   }
1094 
1095   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1096     .unsupportedIf([=](const LegalityQuery &Query) {
1097         const LLT &EltTy = Query.Types[1].getElementType();
1098         return Query.Types[0] != EltTy;
1099       });
1100 
1101   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1102     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1103     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1104 
1105     // FIXME: Doesn't handle extract of illegal sizes.
1106     getActionDefinitionsBuilder(Op)
1107       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1108       // FIXME: Multiples of 16 should not be legal.
1109       .legalIf([=](const LegalityQuery &Query) {
1110           const LLT BigTy = Query.Types[BigTyIdx];
1111           const LLT LitTy = Query.Types[LitTyIdx];
1112           return (BigTy.getSizeInBits() % 32 == 0) &&
1113                  (LitTy.getSizeInBits() % 16 == 0);
1114         })
1115       .widenScalarIf(
1116         [=](const LegalityQuery &Query) {
1117           const LLT BigTy = Query.Types[BigTyIdx];
1118           return (BigTy.getScalarSizeInBits() < 16);
1119         },
1120         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1121       .widenScalarIf(
1122         [=](const LegalityQuery &Query) {
1123           const LLT LitTy = Query.Types[LitTyIdx];
1124           return (LitTy.getScalarSizeInBits() < 16);
1125         },
1126         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1127       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1128       .widenScalarToNextPow2(BigTyIdx, 32);
1129 
1130   }
1131 
1132   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1133     .legalForCartesianProduct(AllS32Vectors, {S32})
1134     .legalForCartesianProduct(AllS64Vectors, {S64})
1135     .clampNumElements(0, V16S32, V32S32)
1136     .clampNumElements(0, V2S64, V16S64)
1137     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1138 
1139   if (ST.hasScalarPackInsts()) {
1140     BuildVector
1141       // FIXME: Should probably widen s1 vectors straight to s32
1142       .minScalarOrElt(0, S16)
1143       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1144       .minScalar(1, S32);
1145 
1146     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1147       .legalFor({V2S16, S32})
1148       .lower();
1149     BuildVector.minScalarOrElt(0, S32);
1150   } else {
1151     BuildVector.customFor({V2S16, S16});
1152     BuildVector.minScalarOrElt(0, S32);
1153 
1154     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1155       .customFor({V2S16, S32})
1156       .lower();
1157   }
1158 
1159   BuildVector.legalIf(isRegisterType(0));
1160 
1161   // FIXME: Clamp maximum size
1162   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1163     .legalIf(isRegisterType(0));
1164 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1167   if (ST.hasVOP3PInsts()) {
1168     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1169       .customFor({V2S16, V2S16})
1170       .lower();
1171   } else
1172     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1173 
1174   // Merge/Unmerge
1175   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1176     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1177     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1178 
1179     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1180       const LLT &Ty = Query.Types[TypeIdx];
1181       if (Ty.isVector()) {
1182         const LLT &EltTy = Ty.getElementType();
1183         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1184           return true;
1185         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1186           return true;
1187       }
1188       return false;
1189     };
1190 
1191     auto &Builder = getActionDefinitionsBuilder(Op)
1192       // Try to widen to s16 first for small types.
1193       // TODO: Only do this on targets with legal s16 shifts
1194       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1195 
1196       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1197       .lowerFor({{S16, V2S16}})
1198       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1199       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1200                            elementTypeIs(1, S16)),
1201                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1205       .clampScalar(LitTyIdx, S32, S256)
1206       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1207       // Break up vectors with weird elements into scalars
1208       .fewerElementsIf(
1209         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1210         scalarize(0))
1211       .fewerElementsIf(
1212         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1213         scalarize(1))
1214       .clampScalar(BigTyIdx, S32, S1024);
1215 
1216     if (Op == G_MERGE_VALUES) {
1217       Builder.widenScalarIf(
1218         // TODO: Use 16-bit shifts if legal for 8-bit values?
1219         [=](const LegalityQuery &Query) {
1220           const LLT Ty = Query.Types[LitTyIdx];
1221           return Ty.getSizeInBits() < 32;
1222         },
1223         changeTo(LitTyIdx, S32));
1224     }
1225 
1226     Builder.widenScalarIf(
1227       [=](const LegalityQuery &Query) {
1228         const LLT Ty = Query.Types[BigTyIdx];
1229         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1230           Ty.getSizeInBits() % 16 != 0;
1231       },
1232       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1235         const LLT &Ty = Query.Types[BigTyIdx];
1236         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1237         if (NewSizeInBits >= 256) {
1238           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1239           if (RoundedTo < NewSizeInBits)
1240             NewSizeInBits = RoundedTo;
1241         }
1242         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1243       })
1244       .legalIf([=](const LegalityQuery &Query) {
1245           const LLT &BigTy = Query.Types[BigTyIdx];
1246           const LLT &LitTy = Query.Types[LitTyIdx];
1247 
1248           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1249             return false;
1250           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1251             return false;
1252 
1253           return BigTy.getSizeInBits() % 16 == 0 &&
1254                  LitTy.getSizeInBits() % 16 == 0 &&
1255                  BigTy.getSizeInBits() <= 1024;
1256         })
1257       // Any vectors left are the wrong size. Scalarize them.
1258       .scalarize(0)
1259       .scalarize(1);
1260   }
1261 
1262   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1263   // RegBankSelect.
1264   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1265     .legalFor({{S32}, {S64}});
1266 
1267   if (ST.hasVOP3PInsts()) {
1268     SextInReg.lowerFor({{V2S16}})
1269       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1270       // get more vector shift opportunities, since we'll get those when
1271       // expanded.
1272       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1273   } else if (ST.has16BitInsts()) {
1274     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1275   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1278     SextInReg.lowerFor({{S32}, {S64}});
1279   }
1280 
1281   SextInReg
1282     .scalarize(0)
1283     .clampScalar(0, S32, S64)
1284     .lower();
1285 
1286   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1287     .legalFor({S64});
1288 
1289   getActionDefinitionsBuilder({
1290       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1291       G_FCOPYSIGN,
1292 
1293       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1294       G_READ_REGISTER,
1295       G_WRITE_REGISTER,
1296 
1297       G_SADDO, G_SSUBO,
1298 
1299        // TODO: Implement
1300       G_FMINIMUM, G_FMAXIMUM
1301     }).lower();
1302 
1303   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1304         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1305         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1306     .unsupported();
1307 
1308   computeTables();
1309   verify(*ST.getInstrInfo());
1310 }
1311 
1312 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1313                                          MachineRegisterInfo &MRI,
1314                                          MachineIRBuilder &B,
1315                                          GISelChangeObserver &Observer) const {
1316   switch (MI.getOpcode()) {
1317   case TargetOpcode::G_ADDRSPACE_CAST:
1318     return legalizeAddrSpaceCast(MI, MRI, B);
1319   case TargetOpcode::G_FRINT:
1320     return legalizeFrint(MI, MRI, B);
1321   case TargetOpcode::G_FCEIL:
1322     return legalizeFceil(MI, MRI, B);
1323   case TargetOpcode::G_INTRINSIC_TRUNC:
1324     return legalizeIntrinsicTrunc(MI, MRI, B);
1325   case TargetOpcode::G_SITOFP:
1326     return legalizeITOFP(MI, MRI, B, true);
1327   case TargetOpcode::G_UITOFP:
1328     return legalizeITOFP(MI, MRI, B, false);
1329   case TargetOpcode::G_FPTOSI:
1330     return legalizeFPTOI(MI, MRI, B, true);
1331   case TargetOpcode::G_FPTOUI:
1332     return legalizeFPTOI(MI, MRI, B, false);
1333   case TargetOpcode::G_FMINNUM:
1334   case TargetOpcode::G_FMAXNUM:
1335   case TargetOpcode::G_FMINNUM_IEEE:
1336   case TargetOpcode::G_FMAXNUM_IEEE:
1337     return legalizeMinNumMaxNum(MI, MRI, B);
1338   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1339     return legalizeExtractVectorElt(MI, MRI, B);
1340   case TargetOpcode::G_INSERT_VECTOR_ELT:
1341     return legalizeInsertVectorElt(MI, MRI, B);
1342   case TargetOpcode::G_SHUFFLE_VECTOR:
1343     return legalizeShuffleVector(MI, MRI, B);
1344   case TargetOpcode::G_FSIN:
1345   case TargetOpcode::G_FCOS:
1346     return legalizeSinCos(MI, MRI, B);
1347   case TargetOpcode::G_GLOBAL_VALUE:
1348     return legalizeGlobalValue(MI, MRI, B);
1349   case TargetOpcode::G_LOAD:
1350     return legalizeLoad(MI, MRI, B, Observer);
1351   case TargetOpcode::G_FMAD:
1352     return legalizeFMad(MI, MRI, B);
1353   case TargetOpcode::G_FDIV:
1354     return legalizeFDIV(MI, MRI, B);
1355   case TargetOpcode::G_UDIV:
1356   case TargetOpcode::G_UREM:
1357     return legalizeUDIV_UREM(MI, MRI, B);
1358   case TargetOpcode::G_SDIV:
1359   case TargetOpcode::G_SREM:
1360     return legalizeSDIV_SREM(MI, MRI, B);
1361   case TargetOpcode::G_ATOMIC_CMPXCHG:
1362     return legalizeAtomicCmpXChg(MI, MRI, B);
1363   case TargetOpcode::G_FLOG:
1364     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1365   case TargetOpcode::G_FLOG10:
1366     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1367   case TargetOpcode::G_FEXP:
1368     return legalizeFExp(MI, B);
1369   case TargetOpcode::G_FPOW:
1370     return legalizeFPow(MI, B);
1371   case TargetOpcode::G_FFLOOR:
1372     return legalizeFFloor(MI, MRI, B);
1373   case TargetOpcode::G_BUILD_VECTOR:
1374     return legalizeBuildVector(MI, MRI, B);
1375   default:
1376     return false;
1377   }
1378 
1379   llvm_unreachable("expected switch to return");
1380 }
1381 
1382 Register AMDGPULegalizerInfo::getSegmentAperture(
1383   unsigned AS,
1384   MachineRegisterInfo &MRI,
1385   MachineIRBuilder &B) const {
1386   MachineFunction &MF = B.getMF();
1387   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1388   const LLT S32 = LLT::scalar(32);
1389 
1390   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1391 
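  // On subtargets with aperture registers, read the aperture base directly via
  // S_GETREG_B32; otherwise it has to be loaded from the queue pointer below.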
1392   if (ST.hasApertureRegs()) {
1393     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1394     // getreg.
1395     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1396         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1397         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1398     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1399         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1400         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
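    // Pack the register id, bit offset and width-minus-one into the
    // S_GETREG_B32 immediate.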
1401     unsigned Encoding =
1402         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1403         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1404         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1405 
1406     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1407 
1408     B.buildInstr(AMDGPU::S_GETREG_B32)
1409       .addDef(GetReg)
1410       .addImm(Encoding);
1411     MRI.setType(GetReg, S32);
1412 
1413     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1414     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1415   }
1416 
1417   Register QueuePtr = MRI.createGenericVirtualRegister(
1418     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1419 
1420   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1421   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1422     return Register();
1423 
1424   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1425   // private_segment_aperture_base_hi.
1426   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1427 
1428   // TODO: can we be smarter about machine pointer info?
1429   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1430   MachineMemOperand *MMO = MF.getMachineMemOperand(
1431     PtrInfo,
1432     MachineMemOperand::MOLoad |
1433     MachineMemOperand::MODereferenceable |
1434     MachineMemOperand::MOInvariant,
1435     4,
1436     MinAlign(64, StructOffset));
1437 
1438   Register LoadAddr;
1439 
1440   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1441   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1442 }
1443 
1444 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1445   MachineInstr &MI, MachineRegisterInfo &MRI,
1446   MachineIRBuilder &B) const {
1447   MachineFunction &MF = B.getMF();
1448 
1449   B.setInstr(MI);
1450 
1451   const LLT S32 = LLT::scalar(32);
1452   Register Dst = MI.getOperand(0).getReg();
1453   Register Src = MI.getOperand(1).getReg();
1454 
1455   LLT DstTy = MRI.getType(Dst);
1456   LLT SrcTy = MRI.getType(Src);
1457   unsigned DestAS = DstTy.getAddressSpace();
1458   unsigned SrcAS = SrcTy.getAddressSpace();
1459 
1460   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1461   // vector element.
1462   assert(!DstTy.isVector());
1463 
1464   const AMDGPUTargetMachine &TM
1465     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1466 
1467   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1468   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1469     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1470     return true;
1471   }
1472 
1473   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1474     // Truncate.
1475     B.buildExtract(Dst, Src, 0);
1476     MI.eraseFromParent();
1477     return true;
1478   }
1479 
1480   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1481     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1482     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1483 
1484     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1485     // another pointer. Merge operands are required to be the same type, but
1486     // creating an extra ptrtoint would be kind of pointless.
1487     auto HighAddr = B.buildConstant(
1488       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1489     B.buildMerge(Dst, {Src, HighAddr});
1490     MI.eraseFromParent();
1491     return true;
1492   }
1493 
1494   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1495     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1496            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1497     unsigned NullVal = TM.getNullPointerValue(DestAS);
1498 
1499     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1500     auto FlatNull = B.buildConstant(SrcTy, 0);
1501 
1502     // Extract low 32-bits of the pointer.
1503     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1504 
1505     auto CmpRes =
1506         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1507     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1508 
1509     MI.eraseFromParent();
1510     return true;
1511   }
1512 
1513   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1514     return false;
1515 
1516   if (!ST.hasFlatAddressSpace())
1517     return false;
1518 
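       // Casting a segment (local/private) pointer to a flat pointer: extend the
       // 32-bit offset with the segment's aperture base, and map the segment null
       // value to the flat null value.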
1519   auto SegmentNull =
1520       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1521   auto FlatNull =
1522       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1523 
1524   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1525   if (!ApertureReg.isValid())
1526     return false;
1527 
1528   auto CmpRes =
1529       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1530 
1531   // Coerce the type of the low half of the result so we can use merge_values.
1532   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1533 
1534   // TODO: Should we allow mismatched types but matching sizes in merges to
1535   // avoid the ptrtoint?
1536   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1537   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1538 
1539   MI.eraseFromParent();
1540   return true;
1541 }
1542 
1543 bool AMDGPULegalizerInfo::legalizeFrint(
1544   MachineInstr &MI, MachineRegisterInfo &MRI,
1545   MachineIRBuilder &B) const {
1546   B.setInstr(MI);
1547 
1548   Register Src = MI.getOperand(1).getReg();
1549   LLT Ty = MRI.getType(Src);
1550   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1551 
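       // Round to the nearest integer by adding and then subtracting 2^52 (with
       // the sign of the source). Values with |x| >= 2^52 are already integers,
       // so keep the original source in that case.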
1552   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1553   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1554 
1555   auto C1 = B.buildFConstant(Ty, C1Val);
1556   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1557 
1558   // TODO: Should this propagate fast-math-flags?
1559   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1560   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1561 
1562   auto C2 = B.buildFConstant(Ty, C2Val);
1563   auto Fabs = B.buildFAbs(Ty, Src);
1564 
1565   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1566   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1567   return true;
1568 }
1569 
1570 bool AMDGPULegalizerInfo::legalizeFceil(
1571   MachineInstr &MI, MachineRegisterInfo &MRI,
1572   MachineIRBuilder &B) const {
1573   B.setInstr(MI);
1574 
1575   const LLT S1 = LLT::scalar(1);
1576   const LLT S64 = LLT::scalar(64);
1577 
1578   Register Src = MI.getOperand(1).getReg();
1579   assert(MRI.getType(Src) == S64);
1580 
1581   // result = trunc(src)
1582   // if (src > 0.0 && src != result)
1583   //   result += 1.0
1584 
1585   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1586 
1587   const auto Zero = B.buildFConstant(S64, 0.0);
1588   const auto One = B.buildFConstant(S64, 1.0);
1589   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1590   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1591   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1592   auto Add = B.buildSelect(S64, And, One, Zero);
1593 
1594   // TODO: Should this propagate fast-math-flags?
1595   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1596   return true;
1597 }
1598 
1599 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1600                                               MachineIRBuilder &B) {
1601   const unsigned FractBits = 52;
1602   const unsigned ExpBits = 11;
1603   LLT S32 = LLT::scalar(32);
1604 
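       // The exponent occupies bits [62:52] of an IEEE double, i.e. bits [30:20]
       // of the high 32-bit word; extract it and subtract the bias (1023).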
1605   auto Const0 = B.buildConstant(S32, FractBits - 32);
1606   auto Const1 = B.buildConstant(S32, ExpBits);
1607 
1608   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1609     .addUse(Register(Hi))
1610     .addUse(Const0.getReg(0)).addUse(Const1.getReg(0));
1611 
1612   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1613 }
1614 
1615 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1616   MachineInstr &MI, MachineRegisterInfo &MRI,
1617   MachineIRBuilder &B) const {
1618   B.setInstr(MI);
1619 
1620   const LLT S1 = LLT::scalar(1);
1621   const LLT S32 = LLT::scalar(32);
1622   const LLT S64 = LLT::scalar(64);
1623 
1624   Register Src = MI.getOperand(1).getReg();
1625   assert(MRI.getType(Src) == S64);
1626 
1627   // TODO: Should this use extract since the low half is unused?
1628   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1629   Register Hi = Unmerge.getReg(1);
1630 
1631   // Extract the upper half, since this is where we will find the sign and
1632   // exponent.
1633   auto Exp = extractF64Exponent(Hi, B);
1634 
1635   const unsigned FractBits = 52;
1636 
1637   // Extract the sign bit.
1638   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1639   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1640 
1641   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1642 
1643   const auto Zero32 = B.buildConstant(S32, 0);
1644 
1645   // Extend back to 64-bits.
1646   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1647 
1648   auto Shr = B.buildAShr(S64, FractMask, Exp);
1649   auto Not = B.buildNot(S64, Shr);
1650   auto Tmp0 = B.buildAnd(S64, Src, Not);
1651   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1652 
1653   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1654   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1655 
1656   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1657   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1658   return true;
1659 }
1660 
1661 bool AMDGPULegalizerInfo::legalizeITOFP(
1662   MachineInstr &MI, MachineRegisterInfo &MRI,
1663   MachineIRBuilder &B, bool Signed) const {
1664   B.setInstr(MI);
1665 
1666   Register Dst = MI.getOperand(0).getReg();
1667   Register Src = MI.getOperand(1).getReg();
1668 
1669   const LLT S64 = LLT::scalar(64);
1670   const LLT S32 = LLT::scalar(32);
1671 
1672   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1673 
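       // Convert the two 32-bit halves separately: scale the converted high half
       // by 2^32 with ldexp, then add the unsigned conversion of the low half.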
1674   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1675 
1676   auto CvtHi = Signed ?
1677     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1678     B.buildUITOFP(S64, Unmerge.getReg(1));
1679 
1680   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1681 
1682   auto ThirtyTwo = B.buildConstant(S32, 32);
1683   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1684     .addUse(CvtHi.getReg(0))
1685     .addUse(ThirtyTwo.getReg(0));
1686 
1687   // TODO: Should this propagate fast-math-flags?
1688   B.buildFAdd(Dst, LdExp, CvtLo);
1689   MI.eraseFromParent();
1690   return true;
1691 }
1692 
1693 // TODO: Copied from DAG implementation. Verify logic and document how this
1694 // actually works.
1695 bool AMDGPULegalizerInfo::legalizeFPTOI(
1696   MachineInstr &MI, MachineRegisterInfo &MRI,
1697   MachineIRBuilder &B, bool Signed) const {
1698   B.setInstr(MI);
1699 
1700   Register Dst = MI.getOperand(0).getReg();
1701   Register Src = MI.getOperand(1).getReg();
1702 
1703   const LLT S64 = LLT::scalar(64);
1704   const LLT S32 = LLT::scalar(32);
1705 
1706   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1707 
1708   unsigned Flags = MI.getFlags();
1709 
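       // Split the truncated value into 32-bit halves: scaling by K0 = 2^-32 and
       // flooring gives the high half, and the fma with K1 = -2^32 recovers the
       // remaining low half.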
1710   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1711   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1712   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1713 
1714   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1715   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1716   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1717 
1718   auto Hi = Signed ?
1719     B.buildFPTOSI(S32, FloorMul) :
1720     B.buildFPTOUI(S32, FloorMul);
1721   auto Lo = B.buildFPTOUI(S32, Fma);
1722 
1723   B.buildMerge(Dst, { Lo, Hi });
1724   MI.eraseFromParent();
1725 
1726   return true;
1727 }
1728 
1729 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1730   MachineInstr &MI, MachineRegisterInfo &MRI,
1731   MachineIRBuilder &B) const {
1732   MachineFunction &MF = B.getMF();
1733   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1734 
1735   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1736                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1737 
1738   // With ieee_mode disabled, the instructions already have the correct
1739   // behavior for G_FMINNUM/G_FMAXNUM.
1740   if (!MFI->getMode().IEEE)
1741     return !IsIEEEOp;
1742 
1743   if (IsIEEEOp)
1744     return true;
1745 
1746   MachineIRBuilder HelperBuilder(MI);
1747   GISelObserverWrapper DummyObserver;
1748   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1749   HelperBuilder.setInstr(MI);
1750   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1751 }
1752 
1753 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1754   MachineInstr &MI, MachineRegisterInfo &MRI,
1755   MachineIRBuilder &B) const {
1756   // TODO: Should move some of this into LegalizerHelper.
1757 
1758   // TODO: Promote dynamic indexing of s16 to s32
1759 
1760   // FIXME: Artifact combiner probably should have replaced the truncated
1761   // constant before this, so we shouldn't need
1762   // getConstantVRegValWithLookThrough.
1763   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1764     MI.getOperand(2).getReg(), MRI);
1765   if (!IdxVal) // Dynamic case will be selected to register indexing.
1766     return true;
1767 
1768   Register Dst = MI.getOperand(0).getReg();
1769   Register Vec = MI.getOperand(1).getReg();
1770 
1771   LLT VecTy = MRI.getType(Vec);
1772   LLT EltTy = VecTy.getElementType();
1773   assert(EltTy == MRI.getType(Dst));
1774 
1775   B.setInstr(MI);
1776 
1777   if (IdxVal->Value < VecTy.getNumElements())
1778     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1779   else
1780     B.buildUndef(Dst);
1781 
1782   MI.eraseFromParent();
1783   return true;
1784 }
1785 
1786 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1787   MachineInstr &MI, MachineRegisterInfo &MRI,
1788   MachineIRBuilder &B) const {
1789   // TODO: Should move some of this into LegalizerHelper.
1790 
1791   // TODO: Promote dynamic indexing of s16 to s32
1792 
1793   // FIXME: Artifact combiner probably should have replaced the truncated
1794   // constant before this, so we shouldn't need
1795   // getConstantVRegValWithLookThrough.
1796   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1797     MI.getOperand(3).getReg(), MRI);
1798   if (!IdxVal) // Dynamic case will be selected to register indexing.
1799     return true;
1800 
1801   Register Dst = MI.getOperand(0).getReg();
1802   Register Vec = MI.getOperand(1).getReg();
1803   Register Ins = MI.getOperand(2).getReg();
1804 
1805   LLT VecTy = MRI.getType(Vec);
1806   LLT EltTy = VecTy.getElementType();
1807   assert(EltTy == MRI.getType(Ins));
1808 
1809   B.setInstr(MI);
1810 
1811   if (IdxVal->Value < VecTy.getNumElements())
1812     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1813   else
1814     B.buildUndef(Dst);
1815 
1816   MI.eraseFromParent();
1817   return true;
1818 }
1819 
1820 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1821   MachineInstr &MI, MachineRegisterInfo &MRI,
1822   MachineIRBuilder &B) const {
1823   const LLT V2S16 = LLT::vector(2, 16);
1824 
1825   Register Dst = MI.getOperand(0).getReg();
1826   Register Src0 = MI.getOperand(1).getReg();
1827   LLT DstTy = MRI.getType(Dst);
1828   LLT SrcTy = MRI.getType(Src0);
1829 
1830   if (SrcTy == V2S16 && DstTy == V2S16 &&
1831       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1832     return true;
1833 
1834   MachineIRBuilder HelperBuilder(MI);
1835   GISelObserverWrapper DummyObserver;
1836   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1837   HelperBuilder.setInstr(MI);
1838   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1839 }
1840 
1841 bool AMDGPULegalizerInfo::legalizeSinCos(
1842   MachineInstr &MI, MachineRegisterInfo &MRI,
1843   MachineIRBuilder &B) const {
1844   B.setInstr(MI);
1845 
1846   Register DstReg = MI.getOperand(0).getReg();
1847   Register SrcReg = MI.getOperand(1).getReg();
1848   LLT Ty = MRI.getType(DstReg);
1849   unsigned Flags = MI.getFlags();
1850 
1851   Register TrigVal;
1852   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1853   if (ST.hasTrigReducedRange()) {
1854     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1855     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1856       .addUse(MulVal.getReg(0))
1857       .setMIFlags(Flags).getReg(0);
1858   } else
1859     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1860 
1861   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1862     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1863   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1864     .addUse(TrigVal)
1865     .setMIFlags(Flags);
1866   MI.eraseFromParent();
1867   return true;
1868 }
1869 
1870 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1871   Register DstReg, LLT PtrTy,
1872   MachineIRBuilder &B, const GlobalValue *GV,
1873   unsigned Offset, unsigned GAFlags) const {
1874   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1875   // to the following code sequence:
1876   //
1877   // For constant address space:
1878   //   s_getpc_b64 s[0:1]
1879   //   s_add_u32 s0, s0, $symbol
1880   //   s_addc_u32 s1, s1, 0
1881   //
1882   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1883   //   a fixup or relocation is emitted to replace $symbol with a literal
1884   //   constant, which is a pc-relative offset from the encoding of the $symbol
1885   //   operand to the global variable.
1886   //
1887   // For global address space:
1888   //   s_getpc_b64 s[0:1]
1889   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1890   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1891   //
1892   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1893   //   fixups or relocations are emitted to replace $symbol@*@lo and
1894   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1895   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1896   //   operand to the global variable.
1897   //
1898   // What we want here is an offset from the value returned by s_getpc
1899   // (which is the address of the s_add_u32 instruction) to the global
1900   // variable, but since the encoding of $symbol starts 4 bytes after the start
1901   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1902   // small. This requires us to add 4 to the global variable offset in order to
1903   // compute the correct address.
1904 
1905   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1906 
1907   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1908     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1909 
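       // For 32-bit destinations, compute the full 64-bit pc-relative address in
       // a temporary register and extract the low 32 bits below.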
1910   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1911     .addDef(PCReg);
1912 
1913   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1914   if (GAFlags == SIInstrInfo::MO_NONE)
1915     MIB.addImm(0);
1916   else
1917     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1918 
1919   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1920 
1921   if (PtrTy.getSizeInBits() == 32)
1922     B.buildExtract(DstReg, PCReg, 0);
1923   return true;
1924 }
1925 
1926 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1927   MachineInstr &MI, MachineRegisterInfo &MRI,
1928   MachineIRBuilder &B) const {
1929   Register DstReg = MI.getOperand(0).getReg();
1930   LLT Ty = MRI.getType(DstReg);
1931   unsigned AS = Ty.getAddressSpace();
1932 
1933   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1934   MachineFunction &MF = B.getMF();
1935   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1936   B.setInstr(MI);
1937 
1938   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1939     if (!MFI->isEntryFunction()) {
1940       const Function &Fn = MF.getFunction();
1941       DiagnosticInfoUnsupported BadLDSDecl(
1942         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1943       Fn.getContext().diagnose(BadLDSDecl);
1944     }
1945 
1946     // TODO: We could emit code to handle the initialization somewhere.
1947     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1948       const SITargetLowering *TLI = ST.getTargetLowering();
1949       if (!TLI->shouldUseLDSConstAddress(GV)) {
1950         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1951         return true; // Leave in place.
1952       }
1953 
1954       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1955       MI.eraseFromParent();
1956       return true;
1957     }
1958 
1959     const Function &Fn = MF.getFunction();
1960     DiagnosticInfoUnsupported BadInit(
1961       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1962     Fn.getContext().diagnose(BadInit);
1963     return true;
1964   }
1965 
1966   const SITargetLowering *TLI = ST.getTargetLowering();
1967 
1968   if (TLI->shouldEmitFixup(GV)) {
1969     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1970     MI.eraseFromParent();
1971     return true;
1972   }
1973 
1974   if (TLI->shouldEmitPCReloc(GV)) {
1975     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1976     MI.eraseFromParent();
1977     return true;
1978   }
1979 
1980   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1981   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1982 
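       // Otherwise go through the GOT: materialize the address of the GOT entry
       // pc-relatively and load the global's actual address from it.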
1983   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1984     MachinePointerInfo::getGOT(MF),
1985     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1986     MachineMemOperand::MOInvariant,
1987     8 /*Size*/, 8 /*Align*/);
1988 
1989   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1990 
1991   if (Ty.getSizeInBits() == 32) {
1992     // Truncate if this is a 32-bit constant address.
1993     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1994     B.buildExtract(DstReg, Load, 0);
1995   } else
1996     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1997 
1998   MI.eraseFromParent();
1999   return true;
2000 }
2001 
2002 bool AMDGPULegalizerInfo::legalizeLoad(
2003   MachineInstr &MI, MachineRegisterInfo &MRI,
2004   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2005   B.setInstr(MI);
2006   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2007   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2008   Observer.changingInstr(MI);
2009   MI.getOperand(1).setReg(Cast.getReg(0));
2010   Observer.changedInstr(MI);
2011   return true;
2012 }
2013 
2014 bool AMDGPULegalizerInfo::legalizeFMad(
2015   MachineInstr &MI, MachineRegisterInfo &MRI,
2016   MachineIRBuilder &B) const {
2017   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2018   assert(Ty.isScalar());
2019 
2020   MachineFunction &MF = B.getMF();
2021   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2022 
2023   // TODO: Always legal with future ftz flag.
2024   // FIXME: Do we only need to check the output type's denormal mode?
2025   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2026     return true;
2027   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2028     return true;
2029 
2030   MachineIRBuilder HelperBuilder(MI);
2031   GISelObserverWrapper DummyObserver;
2032   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2033   HelperBuilder.setMBB(*MI.getParent());
2034   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2035 }
2036 
2037 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2038   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2039   Register DstReg = MI.getOperand(0).getReg();
2040   Register PtrReg = MI.getOperand(1).getReg();
2041   Register CmpVal = MI.getOperand(2).getReg();
2042   Register NewVal = MI.getOperand(3).getReg();
2043 
2044   assert(SITargetLowering::isFlatGlobalAddrSpace(
2045            MRI.getType(PtrReg).getAddressSpace()) &&
2046          "this should not have been custom lowered");
2047 
2048   LLT ValTy = MRI.getType(CmpVal);
2049   LLT VecTy = LLT::vector(2, ValTy);
2050 
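       // The target cmpxchg instruction takes the new value and the compare value
       // packed together as a two-element vector operand.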
2051   B.setInstr(MI);
2052   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2053 
2054   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2055     .addDef(DstReg)
2056     .addUse(PtrReg)
2057     .addUse(PackedVal)
2058     .setMemRefs(MI.memoperands());
2059 
2060   MI.eraseFromParent();
2061   return true;
2062 }
2063 
2064 bool AMDGPULegalizerInfo::legalizeFlog(
2065   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2066   Register Dst = MI.getOperand(0).getReg();
2067   Register Src = MI.getOperand(1).getReg();
2068   LLT Ty = B.getMRI()->getType(Dst);
2069   unsigned Flags = MI.getFlags();
2070   B.setInstr(MI);
2071 
2072   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2073   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2074 
2075   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2076   MI.eraseFromParent();
2077   return true;
2078 }
2079 
2080 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2081                                        MachineIRBuilder &B) const {
2082   Register Dst = MI.getOperand(0).getReg();
2083   Register Src = MI.getOperand(1).getReg();
2084   unsigned Flags = MI.getFlags();
2085   LLT Ty = B.getMRI()->getType(Dst);
2086   B.setInstr(MI);
2087 
2088   auto K = B.buildFConstant(Ty, numbers::log2e);
2089   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2090   B.buildFExp2(Dst, Mul, Flags);
2091   MI.eraseFromParent();
2092   return true;
2093 }
2094 
2095 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2096                                        MachineIRBuilder &B) const {
2097   Register Dst = MI.getOperand(0).getReg();
2098   Register Src0 = MI.getOperand(1).getReg();
2099   Register Src1 = MI.getOperand(2).getReg();
2100   unsigned Flags = MI.getFlags();
2101   LLT Ty = B.getMRI()->getType(Dst);
2102   B.setInstr(MI);
2103   const LLT S16 = LLT::scalar(16);
2104   const LLT S32 = LLT::scalar(32);
2105 
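       // Expand pow(x, y) as exp2(y * log2(x)), using the fmul_legacy intrinsic
       // (which treats 0 * x as 0) for the multiply.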
2106   if (Ty == S32) {
2107     auto Log = B.buildFLog2(S32, Src0, Flags);
2108     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2109       .addUse(Log.getReg(0))
2110       .addUse(Src1)
2111       .setMIFlags(Flags);
2112     B.buildFExp2(Dst, Mul, Flags);
2113   } else if (Ty == S16) {
2114     // There's no f16 fmul_legacy, so we need to convert for it.
2115     auto Log = B.buildFLog2(S16, Src0, Flags);
2116     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2117     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2118     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2119       .addUse(Ext0.getReg(0))
2120       .addUse(Ext1.getReg(0))
2121       .setMIFlags(Flags);
2122 
2123     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2124   } else
2125     return false;
2126 
2127   MI.eraseFromParent();
2128   return true;
2129 }
2130 
2131 // Find a source register, ignoring any possible source modifiers.
2132 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2133   Register ModSrc = OrigSrc;
2134   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2135     ModSrc = SrcFNeg->getOperand(1).getReg();
2136     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2137       ModSrc = SrcFAbs->getOperand(1).getReg();
2138   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2139     ModSrc = SrcFAbs->getOperand(1).getReg();
2140   return ModSrc;
2141 }
2142 
2143 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2144                                          MachineRegisterInfo &MRI,
2145                                          MachineIRBuilder &B) const {
2146   B.setInstr(MI);
2147 
2148   const LLT S1 = LLT::scalar(1);
2149   const LLT S64 = LLT::scalar(64);
2150   Register Dst = MI.getOperand(0).getReg();
2151   Register OrigSrc = MI.getOperand(1).getReg();
2152   unsigned Flags = MI.getFlags();
2153   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2154          "this should not have been custom lowered");
2155 
2156   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2157   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2158   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2159   // V_FRACT bug is:
2160   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2161   //
2162   // Convert floor(x) to (x - fract(x))
2163 
2164   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2165     .addUse(OrigSrc)
2166     .setMIFlags(Flags);
2167 
2168   // Give source modifier matching some assistance before obscuring a foldable
2169   // pattern.
2170 
2171   // TODO: Can we avoid the negation of the fract result? The sign of the
2172   // input to fract shouldn't matter.
2173   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2174 
2175   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2176 
2177   Register Min = MRI.createGenericVirtualRegister(S64);
2178 
2179   // We don't need to concern ourselves with the snan handling difference, so
2180   // use the one which will directly select.
2181   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2182   if (MFI->getMode().IEEE)
2183     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2184   else
2185     B.buildFMinNum(Min, Fract, Const, Flags);
2186 
2187   Register CorrectedFract = Min;
2188   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2189     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2190     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2191   }
2192 
2193   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2194   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2195 
2196   MI.eraseFromParent();
2197   return true;
2198 }
2199 
2200 // Turn an illegal packed v2s16 build vector into bit operations.
2201 // TODO: This should probably be a bitcast action in LegalizerHelper.
2202 bool AMDGPULegalizerInfo::legalizeBuildVector(
2203   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2204   Register Dst = MI.getOperand(0).getReg();
2205   LLT DstTy = MRI.getType(Dst);
2206   const LLT S32 = LLT::scalar(32);
2207   const LLT V2S16 = LLT::vector(2, 16);
2208   (void)DstTy;
2209   (void)V2S16;
2210   assert(DstTy == V2S16);
2211 
2212   Register Src0 = MI.getOperand(1).getReg();
2213   Register Src1 = MI.getOperand(2).getReg();
2214   assert(MRI.getType(Src0) == LLT::scalar(16));
2215 
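       // The packed result is built as, for example:
       //   %merge:_(s32)     = G_MERGE_VALUES %src0:_(s16), %src1:_(s16)
       //   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)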
2216   B.setInstr(MI);
2217   auto Merge = B.buildMerge(S32, {Src0, Src1});
2218   B.buildBitcast(Dst, Merge);
2219 
2220   MI.eraseFromParent();
2221   return true;
2222 }
2223 
2224 // Return the use branch instruction, otherwise null if the usage is invalid.
2225 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2226                                        MachineRegisterInfo &MRI,
2227                                        MachineInstr *&Br) {
2228   Register CondDef = MI.getOperand(0).getReg();
2229   if (!MRI.hasOneNonDBGUse(CondDef))
2230     return nullptr;
2231 
2232   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2233   if (UseMI.getParent() != MI.getParent() ||
2234       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2235     return nullptr;
2236 
2237   // Make sure the cond br is followed by a G_BR
2238   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2239   if (Next != MI.getParent()->end()) {
2240     if (Next->getOpcode() != AMDGPU::G_BR)
2241       return nullptr;
2242     Br = &*Next;
2243   }
2244 
2245   return &UseMI;
2246 }
2247 
2248 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2249                                                 Register Reg, LLT Ty) const {
2250   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2251   if (LiveIn)
2252     return LiveIn;
2253 
2254   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2255   MRI.addLiveIn(Reg, NewReg);
2256   return NewReg;
2257 }
2258 
2259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2260                                          const ArgDescriptor *Arg) const {
2261   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2262     return false; // TODO: Handle these
2263 
2264   assert(Arg->getRegister().isPhysical());
2265 
2266   MachineRegisterInfo &MRI = *B.getMRI();
2267 
2268   LLT Ty = MRI.getType(DstReg);
2269   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2270 
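       // Masked arguments share a physical register with other values; extract
       // the field with a shift and mask.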
2271   if (Arg->isMasked()) {
2272     // TODO: Should we try to emit this once in the entry block?
2273     const LLT S32 = LLT::scalar(32);
2274     const unsigned Mask = Arg->getMask();
2275     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2276 
2277     Register AndMaskSrc = LiveIn;
2278 
2279     if (Shift != 0) {
2280       auto ShiftAmt = B.buildConstant(S32, Shift);
2281       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2282     }
2283 
2284     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2285   } else
2286     B.buildCopy(DstReg, LiveIn);
2287 
2288   // Insert the argument copy if it doesn't already exist.
2289   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2290   if (!MRI.getVRegDef(LiveIn)) {
2291     // FIXME: Should have scoped insert pt
2292     MachineBasicBlock &OrigInsBB = B.getMBB();
2293     auto OrigInsPt = B.getInsertPt();
2294 
2295     MachineBasicBlock &EntryMBB = B.getMF().front();
2296     EntryMBB.addLiveIn(Arg->getRegister());
2297     B.setInsertPt(EntryMBB, EntryMBB.begin());
2298     B.buildCopy(LiveIn, Arg->getRegister());
2299 
2300     B.setInsertPt(OrigInsBB, OrigInsPt);
2301   }
2302 
2303   return true;
2304 }
2305 
2306 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2307   MachineInstr &MI,
2308   MachineRegisterInfo &MRI,
2309   MachineIRBuilder &B,
2310   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2311   B.setInstr(MI);
2312 
2313   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2314 
2315   const ArgDescriptor *Arg;
2316   const TargetRegisterClass *RC;
2317   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2318   if (!Arg) {
2319     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2320     return false;
2321   }
2322 
2323   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2324     MI.eraseFromParent();
2325     return true;
2326   }
2327 
2328   return false;
2329 }
2330 
2331 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2332                                        MachineRegisterInfo &MRI,
2333                                        MachineIRBuilder &B) const {
2334   B.setInstr(MI);
2335   Register Dst = MI.getOperand(0).getReg();
2336   LLT DstTy = MRI.getType(Dst);
2337   LLT S16 = LLT::scalar(16);
2338   LLT S32 = LLT::scalar(32);
2339   LLT S64 = LLT::scalar(64);
2340 
2341   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2342     return true;
2343 
2344   if (DstTy == S16)
2345     return legalizeFDIV16(MI, MRI, B);
2346   if (DstTy == S32)
2347     return legalizeFDIV32(MI, MRI, B);
2348   if (DstTy == S64)
2349     return legalizeFDIV64(MI, MRI, B);
2350 
2351   return false;
2352 }
2353 
2354 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2355   const LLT S32 = LLT::scalar(32);
2356 
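       // Approximate 2^32 / Src: convert to float, take the hardware reciprocal,
       // and scale by 2^32 (0x4f800000 is 4294967296.0f).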
2357   auto Cvt0 = B.buildUITOFP(S32, Src);
2358   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2359   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2360   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2361   return B.buildFPTOUI(S32, Mul).getReg(0);
2362 }
2363 
2364 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2365                                                   Register DstReg,
2366                                                   Register Num,
2367                                                   Register Den,
2368                                                   bool IsRem) const {
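       // Expand 32-bit unsigned division/remainder: start from a reciprocal
       // estimate of the denominator and correct the quotient/remainder with the
       // integer steps below.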
2369   const LLT S1 = LLT::scalar(1);
2370   const LLT S32 = LLT::scalar(32);
2371 
2372   // RCP =  URECIP(Den) = 2^32 / Den + e
2373   // e is rounding error.
2374   auto RCP = buildDivRCP(B, Den);
2375 
2376   // RCP_LO = mul(RCP, Den)
2377   auto RCP_LO = B.buildMul(S32, RCP, Den);
2378 
2379   // RCP_HI = mulhu(RCP, Den)
2380   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2381 
2382   // NEG_RCP_LO = -RCP_LO
2383   auto Zero = B.buildConstant(S32, 0);
2384   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2385 
2386   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2387   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2388   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2389 
2390   // Calculate the rounding error from the URECIP instruction
2391   // E = mulhu(ABS_RCP_LO, RCP)
2392   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2393 
2394   // RCP_A_E = RCP + E
2395   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2396 
2397   // RCP_S_E = RCP - E
2398   auto RCP_S_E = B.buildSub(S32, RCP, E);
2399 
2400   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2401   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2402 
2403   // Quotient = mulhu(Tmp0, Num)
2404   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2405 
2406   // Num_S_Remainder = Quotient * Den
2407   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2408 
2409   // Remainder = Num - Num_S_Remainder
2410   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2411 
2412   // Remainder_GE_Den = Remainder >= Den
2413   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2414 
2415   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2416   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2417                                        Num, Num_S_Remainder);
2418 
2419   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2420   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2421 
2422   // Calculate Division result:
2423 
2424   // Quotient_A_One = Quotient + 1
2425   auto One = B.buildConstant(S32, 1);
2426   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2427 
2428   // Quotient_S_One = Quotient - 1
2429   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2430 
2431   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2432   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2433 
2434   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2435   if (IsRem) {
2436     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2437 
2438     // Calculate Rem result:
2439     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2440 
2441     // Remainder_A_Den = Remainder + Den
2442     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2443 
2444     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2445     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2446 
2447     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2448     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2449   } else {
2450     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2451   }
2452 }
2453 
2454 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2455                                               MachineRegisterInfo &MRI,
2456                                               MachineIRBuilder &B) const {
2457   B.setInstr(MI);
2458   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2459   Register DstReg = MI.getOperand(0).getReg();
2460   Register Num = MI.getOperand(1).getReg();
2461   Register Den = MI.getOperand(2).getReg();
2462   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2463   MI.eraseFromParent();
2464   return true;
2465 }
2466 
2467 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2468                                             MachineRegisterInfo &MRI,
2469                                             MachineIRBuilder &B) const {
2470   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2471     return legalizeUDIV_UREM32(MI, MRI, B);
2472   return false;
2473 }
2474 
2475 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2476                                               MachineRegisterInfo &MRI,
2477                                               MachineIRBuilder &B) const {
2478   B.setInstr(MI);
2479   const LLT S32 = LLT::scalar(32);
2480 
2481   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2482   Register DstReg = MI.getOperand(0).getReg();
2483   Register LHS = MI.getOperand(1).getReg();
2484   Register RHS = MI.getOperand(2).getReg();
2485 
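       // Compute |LHS| and |RHS| as (x + sign) ^ sign, run the unsigned
       // expansion, then restore the sign: the remainder takes the sign of the
       // LHS and the quotient the XOR of the two signs.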
2486   auto ThirtyOne = B.buildConstant(S32, 31);
2487   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2488   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2489 
2490   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2491   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2492 
2493   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2494   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2495 
2496   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2497   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2498 
2499   if (IsRem) {
2500     auto RSign = LHSign; // Remainder sign is the same as LHS
2501     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2502     B.buildSub(DstReg, UDivRem, RSign);
2503   } else {
2504     auto DSign = B.buildXor(S32, LHSign, RHSign);
2505     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2506     B.buildSub(DstReg, UDivRem, DSign);
2507   }
2508 
2509   MI.eraseFromParent();
2510   return true;
2511 }
2512 
2513 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2514                                             MachineRegisterInfo &MRI,
2515                                             MachineIRBuilder &B) const {
2516   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2517     return legalizeSDIV_SREM32(MI, MRI, B);
2518   return false;
2519 }
2520 
2521 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2522                                                  MachineRegisterInfo &MRI,
2523                                                  MachineIRBuilder &B) const {
2524   Register Res = MI.getOperand(0).getReg();
2525   Register LHS = MI.getOperand(1).getReg();
2526   Register RHS = MI.getOperand(2).getReg();
2527 
2528   uint16_t Flags = MI.getFlags();
2529 
2530   LLT ResTy = MRI.getType(Res);
2531   LLT S32 = LLT::scalar(32);
2532   LLT S64 = LLT::scalar(64);
2533 
2534   const MachineFunction &MF = B.getMF();
2535   bool Unsafe =
2536     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2537 
2538   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2539     return false;
2540 
2541   if (!Unsafe && ResTy == S32 &&
2542       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2543     return false;
2544 
2545   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2546     // 1 / x -> RCP(x)
2547     if (CLHS->isExactlyValue(1.0)) {
2548       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2549         .addUse(RHS)
2550         .setMIFlags(Flags);
2551 
2552       MI.eraseFromParent();
2553       return true;
2554     }
2555 
2556     // -1 / x -> RCP( FNEG(x) )
2557     if (CLHS->isExactlyValue(-1.0)) {
2558       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2559       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2560         .addUse(FNeg.getReg(0))
2561         .setMIFlags(Flags);
2562 
2563       MI.eraseFromParent();
2564       return true;
2565     }
2566   }
2567 
2568   // x / y -> x * (1.0 / y)
2569   if (Unsafe) {
2570     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2571       .addUse(RHS)
2572       .setMIFlags(Flags);
2573     B.buildFMul(Res, LHS, RCP, Flags);
2574 
2575     MI.eraseFromParent();
2576     return true;
2577   }
2578 
2579   return false;
2580 }
2581 
2582 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2583                                          MachineRegisterInfo &MRI,
2584                                          MachineIRBuilder &B) const {
2585   B.setInstr(MI);
2586   Register Res = MI.getOperand(0).getReg();
2587   Register LHS = MI.getOperand(1).getReg();
2588   Register RHS = MI.getOperand(2).getReg();
2589 
2590   uint16_t Flags = MI.getFlags();
2591 
2592   LLT S16 = LLT::scalar(16);
2593   LLT S32 = LLT::scalar(32);
2594 
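       // f16 division: extend to f32, multiply by the f32 reciprocal of the
       // denominator, truncate back to f16, and let div_fixup handle the special
       // cases.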
2595   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2596   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2597 
2598   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2599     .addUse(RHSExt.getReg(0))
2600     .setMIFlags(Flags);
2601 
2602   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2603   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2604 
2605   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2606     .addUse(RDst.getReg(0))
2607     .addUse(RHS)
2608     .addUse(LHS)
2609     .setMIFlags(Flags);
2610 
2611   MI.eraseFromParent();
2612   return true;
2613 }
2614 
2615 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2616 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2617 static void toggleSPDenormMode(bool Enable,
2618                                MachineIRBuilder &B,
2619                                const GCNSubtarget &ST,
2620                                AMDGPU::SIModeRegisterDefaults Mode) {
2621   // Set SP denorm mode to this value.
2622   unsigned SPDenormMode =
2623     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2624 
2625   if (ST.hasDenormModeInst()) {
2626     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2627     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2628 
2629     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2630     B.buildInstr(AMDGPU::S_DENORM_MODE)
2631       .addImm(NewDenormModeValue);
2632 
2633   } else {
2634     // Select FP32 bit field in mode register.
2635     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2636                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2637                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2638 
2639     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2640       .addImm(SPDenormMode)
2641       .addImm(SPDenormModeBitField);
2642   }
2643 }
2644 
2645 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2646                                          MachineRegisterInfo &MRI,
2647                                          MachineIRBuilder &B) const {
2648   B.setInstr(MI);
2649   Register Res = MI.getOperand(0).getReg();
2650   Register LHS = MI.getOperand(1).getReg();
2651   Register RHS = MI.getOperand(2).getReg();
2652   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2653   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2654 
2655   uint16_t Flags = MI.getFlags();
2656 
2657   LLT S32 = LLT::scalar(32);
2658   LLT S1 = LLT::scalar(1);
2659 
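       // Full f32 division: scale the operands with div_scale, refine a
       // reciprocal estimate with the FMA steps below, then combine the result
       // with div_fmas and div_fixup.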
2660   auto One = B.buildFConstant(S32, 1.0f);
2661 
2662   auto DenominatorScaled =
2663     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2664       .addUse(RHS)
2665       .addUse(LHS)
2666       .addImm(1)
2667       .setMIFlags(Flags);
2668   auto NumeratorScaled =
2669     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2670       .addUse(LHS)
2671       .addUse(RHS)
2672       .addImm(0)
2673       .setMIFlags(Flags);
2674 
2675   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2676     .addUse(DenominatorScaled.getReg(0))
2677     .setMIFlags(Flags);
2678   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2679 
2680   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2681   // aren't modeled as reading it.
2682   if (!Mode.allFP32Denormals())
2683     toggleSPDenormMode(true, B, ST, Mode);
2684 
2685   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2686   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2687   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2688   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2689   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2690   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2691 
2692   if (!Mode.allFP32Denormals())
2693     toggleSPDenormMode(false, B, ST, Mode);
2694 
2695   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2696     .addUse(Fma4.getReg(0))
2697     .addUse(Fma1.getReg(0))
2698     .addUse(Fma3.getReg(0))
2699     .addUse(NumeratorScaled.getReg(1))
2700     .setMIFlags(Flags);
2701 
2702   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2703     .addUse(Fmas.getReg(0))
2704     .addUse(RHS)
2705     .addUse(LHS)
2706     .setMIFlags(Flags);
2707 
2708   MI.eraseFromParent();
2709   return true;
2710 }
2711 
2712 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2713                                          MachineRegisterInfo &MRI,
2714                                          MachineIRBuilder &B) const {
2715   B.setInstr(MI);
2716   Register Res = MI.getOperand(0).getReg();
2717   Register LHS = MI.getOperand(1).getReg();
2718   Register RHS = MI.getOperand(2).getReg();
2719 
2720   uint16_t Flags = MI.getFlags();
2721 
2722   LLT S64 = LLT::scalar(64);
2723   LLT S1 = LLT::scalar(1);
2724 
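       // f64 division uses the same div_scale/rcp/FMA refinement sequence as the
       // f32 path, plus a workaround for subtargets where div_scale's condition
       // output is unusable.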
2725   auto One = B.buildFConstant(S64, 1.0);
2726 
2727   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2728     .addUse(LHS)
2729     .addUse(RHS)
2730     .addImm(1)
2731     .setMIFlags(Flags);
2732 
2733   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2734 
2735   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2736     .addUse(DivScale0.getReg(0))
2737     .setMIFlags(Flags);
2738 
2739   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2740   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2741   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2742 
2743   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2744     .addUse(LHS)
2745     .addUse(RHS)
2746     .addImm(0)
2747     .setMIFlags(Flags);
2748 
2749   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2750   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2751   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2752 
2753   Register Scale;
2754   if (!ST.hasUsableDivScaleConditionOutput()) {
2755     // Workaround a hardware bug on SI where the condition output from div_scale
2756     // is not usable.
2757 
2758     LLT S32 = LLT::scalar(32);
2759 
2760     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2761     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2762     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2763     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2764 
2765     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2766                               Scale1Unmerge.getReg(1));
2767     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2768                               Scale0Unmerge.getReg(1));
2769     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2770   } else {
2771     Scale = DivScale1.getReg(1);
2772   }
2773 
2774   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2775     .addUse(Fma4.getReg(0))
2776     .addUse(Fma3.getReg(0))
2777     .addUse(Mul.getReg(0))
2778     .addUse(Scale)
2779     .setMIFlags(Flags);
2780 
2781   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2782     .addUse(Fmas.getReg(0))
2783     .addUse(RHS)
2784     .addUse(LHS)
2785     .setMIFlags(Flags);
2786 
2787   MI.eraseFromParent();
2788   return true;
2789 }
2790 
2791 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2792                                                  MachineRegisterInfo &MRI,
2793                                                  MachineIRBuilder &B) const {
2794   B.setInstr(MI);
2795   Register Res = MI.getOperand(0).getReg();
2796   Register LHS = MI.getOperand(2).getReg();
2797   Register RHS = MI.getOperand(3).getReg();
2798   uint16_t Flags = MI.getFlags();
2799 
2800   LLT S32 = LLT::scalar(32);
2801   LLT S1 = LLT::scalar(1);
2802 
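       // Avoid overflow in the reciprocal: if |RHS| is larger than 2^96
       // (0x6f800000), pre-scale it by 2^-32 (0x2f800000), and apply the same
       // scale factor to the final result.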
2803   auto Abs = B.buildFAbs(S32, RHS, Flags);
2804   const APFloat C0Val(1.0f);
2805 
2806   auto C0 = B.buildConstant(S32, 0x6f800000);
2807   auto C1 = B.buildConstant(S32, 0x2f800000);
2808   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2809 
2810   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2811   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2812 
2813   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2814 
2815   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2816     .addUse(Mul0.getReg(0))
2817     .setMIFlags(Flags);
2818 
2819   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2820 
2821   B.buildFMul(Res, Sel, Mul1, Flags);
2822 
2823   MI.eraseFromParent();
2824   return true;
2825 }
2826 
2827 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2828                                                  MachineRegisterInfo &MRI,
2829                                                  MachineIRBuilder &B) const {
2830   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2831   if (!MFI->isEntryFunction()) {
2832     return legalizePreloadedArgIntrin(MI, MRI, B,
2833                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2834   }
2835 
2836   B.setInstr(MI);
2837 
2838   uint64_t Offset =
2839     ST.getTargetLowering()->getImplicitParameterOffset(
2840       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2841   Register DstReg = MI.getOperand(0).getReg();
2842   LLT DstTy = MRI.getType(DstReg);
2843   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2844 
2845   const ArgDescriptor *Arg;
2846   const TargetRegisterClass *RC;
2847   std::tie(Arg, RC)
2848     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2849   if (!Arg)
2850     return false;
2851 
2852   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2853   if (!loadInputValue(KernargPtrReg, B, Arg))
2854     return false;
2855 
2856   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2857   MI.eraseFromParent();
2858   return true;
2859 }
2860 
2861 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2862                                               MachineRegisterInfo &MRI,
2863                                               MachineIRBuilder &B,
2864                                               unsigned AddrSpace) const {
2865   B.setInstr(MI);
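       // A flat pointer is in the queried segment iff its high 32 bits equal the
       // segment's aperture base.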
2866   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2867   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2868   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2869   MI.eraseFromParent();
2870   return true;
2871 }
2872 
2873 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2874 // offset (the offset that is included in bounds checking and swizzling, to be
2875 // split between the instruction's voffset and immoffset fields) and soffset
2876 // (the offset that is excluded from bounds checking and swizzling, to go in
2877 // the instruction's soffset field).  This function takes the first kind of
2878 // offset and figures out how to split it between voffset and immoffset.
2879 std::tuple<Register, unsigned, unsigned>
2880 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2881                                         Register OrigOffset) const {
2882   const unsigned MaxImm = 4095;
2883   Register BaseReg;
2884   unsigned TotalConstOffset;
2885   MachineInstr *OffsetDef;
2886   const LLT S32 = LLT::scalar(32);
2887 
2888   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2889     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2890 
2891   unsigned ImmOffset = TotalConstOffset;
2892 
2893   // If the immediate value is too big for the immoffset field, keep only its
2894   // low 12 bits in the immoffset field so that the value that is copied/added
2895   // for the voffset field is a multiple of 4096, and it stands more chance
2896   // of being CSEd with the copy/add for another similar load/store.
2897   // However, do not do that rounding down to a multiple of 4096 if that is a
2898   // negative number, as it appears to be illegal to have a negative offset
2899   // in the vgpr, even if adding the immediate offset makes it positive.
2900   unsigned Overflow = ImmOffset & ~MaxImm;
2901   ImmOffset -= Overflow;
2902   if ((int32_t)Overflow < 0) {
2903     Overflow += ImmOffset;
2904     ImmOffset = 0;
2905   }
2906 
2907   if (Overflow != 0) {
2908     if (!BaseReg) {
2909       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2910     } else {
2911       auto OverflowVal = B.buildConstant(S32, Overflow);
2912       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2913     }
2914   }
2915 
2916   if (!BaseReg)
2917     BaseReg = B.buildConstant(S32, 0).getReg(0);
2918 
2919   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2920 }
2921 
2922 /// Handle register layout difference for f16 images for some subtargets.
2923 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2924                                              MachineRegisterInfo &MRI,
2925                                              Register Reg) const {
2926   if (!ST.hasUnpackedD16VMem())
2927     return Reg;
2928 
2929   const LLT S16 = LLT::scalar(16);
2930   const LLT S32 = LLT::scalar(32);
2931   LLT StoreVT = MRI.getType(Reg);
2932   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2933 
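  // In the unpacked layout each 16-bit element occupies the low half of a
  // 32-bit register, so any-extend every element and rebuild the vector with
  // 32-bit elements.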
2934   auto Unmerge = B.buildUnmerge(S16, Reg);
2935 
2936   SmallVector<Register, 4> WideRegs;
2937   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2938     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2939 
2940   int NumElts = StoreVT.getNumElements();
2941 
2942   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2943 }
2944 
2945 Register AMDGPULegalizerInfo::fixStoreSourceType(
2946   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2947   MachineRegisterInfo *MRI = B.getMRI();
2948   LLT Ty = MRI->getType(VData);
2949 
2950   const LLT S16 = LLT::scalar(16);
2951 
  // Fix up illegal register types for 8-bit and 16-bit stores by
  // any-extending to 32 bits.
2953   if (Ty == LLT::scalar(8) || Ty == S16) {
2954     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2955     return AnyExt;
2956   }
2957 
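  // Only format stores need the d16 layout fixup; other vector types are
  // left unchanged here.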
2958   if (Ty.isVector()) {
2959     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2960       if (IsFormat)
2961         return handleD16VData(B, *MRI, VData);
2962     }
2963   }
2964 
2965   return VData;
2966 }
2967 
2968 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2969                                               MachineRegisterInfo &MRI,
2970                                               MachineIRBuilder &B,
2971                                               bool IsTyped,
2972                                               bool IsFormat) const {
2973   B.setInstr(MI);
2974 
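  // Operand 0 is the intrinsic ID; the stored value and resource descriptor
  // follow it.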
2975   Register VData = MI.getOperand(1).getReg();
2976   LLT Ty = MRI.getType(VData);
2977   LLT EltTy = Ty.getScalarType();
2978   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2979   const LLT S32 = LLT::scalar(32);
2980 
2981   VData = fixStoreSourceType(B, VData, IsFormat);
2982   Register RSrc = MI.getOperand(2).getReg();
2983 
2984   MachineMemOperand *MMO = *MI.memoperands_begin();
2985   const int MemSize = MMO->getSize();
2986 
2987   unsigned ImmOffset;
2988   unsigned TotalOffset;
2989 
2990   // The typed intrinsics add an immediate after the registers.
2991   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2992 
2993   // The struct intrinsic variants add one additional operand over raw.
2994   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2995   Register VIndex;
2996   int OpOffset = 0;
2997   if (HasVIndex) {
2998     VIndex = MI.getOperand(3).getReg();
2999     OpOffset = 1;
3000   }
3001 
3002   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3003   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3004 
3005   unsigned Format = 0;
3006   if (IsTyped) {
3007     Format = MI.getOperand(5 + OpOffset).getImm();
3008     ++OpOffset;
3009   }
3010 
3011   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3012 
3013   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3014   if (TotalOffset != 0)
3015     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3016 
3017   unsigned Opc;
3018   if (IsTyped) {
3019     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3020                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3021   } else if (IsFormat) {
3022     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3023                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3024   } else {
3025     switch (MemSize) {
3026     case 1:
3027       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3028       break;
3029     case 2:
3030       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3031       break;
3032     default:
3033       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3034       break;
3035     }
3036   }
3037 
3038   if (!VIndex)
3039     VIndex = B.buildConstant(S32, 0).getReg(0);
3040 
3041   auto MIB = B.buildInstr(Opc)
3042     .addUse(VData)              // vdata
3043     .addUse(RSrc)               // rsrc
3044     .addUse(VIndex)             // vindex
3045     .addUse(VOffset)            // voffset
3046     .addUse(SOffset)            // soffset
3047     .addImm(ImmOffset);         // offset(imm)
3048 
3049   if (IsTyped)
3050     MIB.addImm(Format);
3051 
3052   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3053      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3054      .addMemOperand(MMO);
3055 
3056   MI.eraseFromParent();
3057   return true;
3058 }
3059 
3060 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3061                                              MachineRegisterInfo &MRI,
3062                                              MachineIRBuilder &B,
3063                                              bool IsFormat,
3064                                              bool IsTyped) const {
3065   B.setInstr(MI);
3066 
3067   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3068   MachineMemOperand *MMO = *MI.memoperands_begin();
3069   const int MemSize = MMO->getSize();
3070   const LLT S32 = LLT::scalar(32);
3071 
3072   Register Dst = MI.getOperand(0).getReg();
3073   Register RSrc = MI.getOperand(2).getReg();
3074 
3075   // The typed intrinsics add an immediate after the registers.
3076   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3077 
3078   // The struct intrinsic variants add one additional operand over raw.
3079   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3080   Register VIndex;
3081   int OpOffset = 0;
3082   if (HasVIndex) {
3083     VIndex = MI.getOperand(3).getReg();
3084     OpOffset = 1;
3085   }
3086 
3087   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3088   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3089 
3090   unsigned Format = 0;
3091   if (IsTyped) {
3092     Format = MI.getOperand(5 + OpOffset).getImm();
3093     ++OpOffset;
3094   }
3095 
3096   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3097   unsigned ImmOffset;
3098   unsigned TotalOffset;
3099 
3100   LLT Ty = MRI.getType(Dst);
3101   LLT EltTy = Ty.getScalarType();
3102   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3103   const bool Unpacked = ST.hasUnpackedD16VMem();
3104 
3105   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3106   if (TotalOffset != 0)
3107     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3108 
3109   unsigned Opc;
3110 
3111   if (IsTyped) {
3112     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3113                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3114   } else if (IsFormat) {
3115     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3116                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3117   } else {
3118     switch (MemSize) {
3119     case 1:
3120       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3121       break;
3122     case 2:
3123       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3124       break;
3125     default:
3126       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3127       break;
3128     }
3129   }
3130 
3131   Register LoadDstReg;
3132 
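  // Sub-dword non-d16 loads and scalar d16 loads produce a widened 32-bit
  // result that must be narrowed back to the original type afterwards.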
3133   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3134   LLT UnpackedTy = Ty.changeElementSize(32);
3135 
3136   if (IsExtLoad)
3137     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3138   else if (Unpacked && IsD16 && Ty.isVector())
3139     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3140   else
3141     LoadDstReg = Dst;
3142 
3143   if (!VIndex)
3144     VIndex = B.buildConstant(S32, 0).getReg(0);
3145 
3146   auto MIB = B.buildInstr(Opc)
3147     .addDef(LoadDstReg)         // vdata
3148     .addUse(RSrc)               // rsrc
3149     .addUse(VIndex)             // vindex
3150     .addUse(VOffset)            // voffset
3151     .addUse(SOffset)            // soffset
3152     .addImm(ImmOffset);         // offset(imm)
3153 
3154   if (IsTyped)
3155     MIB.addImm(Format);
3156 
3157   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3158      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3159      .addMemOperand(MMO);
3160 
3161   if (LoadDstReg != Dst) {
3162     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3163 
    // The extending load was widened to 32 bits; truncate back to the
    // original result type.
3165     if (IsExtLoad)
3166       B.buildTrunc(Dst, LoadDstReg);
3167     else {
3168       // Repack to original 16-bit vector result
3169       // FIXME: G_TRUNC should work, but legalization currently fails
3170       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3171       SmallVector<Register, 4> Repack;
3172       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3173         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3174       B.buildMerge(Dst, Repack);
3175     }
3176   }
3177 
3178   MI.eraseFromParent();
3179   return true;
3180 }
3181 
3182 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3183                                                MachineIRBuilder &B,
3184                                                bool IsInc) const {
3185   B.setInstr(MI);
3186   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3187                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
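  // Operand 1 is the intrinsic ID; operands 2 and 3 are the pointer and the
  // data value.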
3188   B.buildInstr(Opc)
3189     .addDef(MI.getOperand(0).getReg())
3190     .addUse(MI.getOperand(2).getReg())
3191     .addUse(MI.getOperand(3).getReg())
3192     .cloneMemRefs(MI);
3193   MI.eraseFromParent();
3194   return true;
3195 }
3196 
3197 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3198   switch (IntrID) {
3199   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3200   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3201     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3202   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3203   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3204     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3205   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3206   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3207     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3208   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3209   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3210     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3211   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3212   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3213     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3214   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3215   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3216     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3217   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3218   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3219     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3220   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3221   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3222     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3223   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3224   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3225     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3226   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3227   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3228     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3229   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3230   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3231     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3232   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3233   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3234     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3235   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3236   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3237     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3238   default:
3239     llvm_unreachable("unhandled atomic opcode");
3240   }
3241 }
3242 
3243 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3244                                                MachineIRBuilder &B,
3245                                                Intrinsic::ID IID) const {
3246   B.setInstr(MI);
3247 
3248   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3249                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3250 
3251   Register Dst = MI.getOperand(0).getReg();
3252   Register VData = MI.getOperand(2).getReg();
3253 
3254   Register CmpVal;
3255   int OpOffset = 0;
3256 
3257   if (IsCmpSwap) {
3258     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3259     ++OpOffset;
3260   }
3261 
3262   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3263   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3264 
3265   // The struct intrinsic variants add one additional operand over raw.
3266   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3267   Register VIndex;
3268   if (HasVIndex) {
3269     VIndex = MI.getOperand(4 + OpOffset).getReg();
3270     ++OpOffset;
3271   }
3272 
3273   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3274   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3275   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3276 
3277   MachineMemOperand *MMO = *MI.memoperands_begin();
3278 
3279   unsigned ImmOffset;
3280   unsigned TotalOffset;
3281   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3282   if (TotalOffset != 0)
3283     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3284 
3285   if (!VIndex)
3286     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3287 
3288   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3289     .addDef(Dst)
3290     .addUse(VData); // vdata
3291 
3292   if (IsCmpSwap)
    MIB.addUse(CmpVal); // cmp
3294 
3295   MIB.addUse(RSrc)               // rsrc
3296      .addUse(VIndex)             // vindex
3297      .addUse(VOffset)            // voffset
3298      .addUse(SOffset)            // soffset
3299      .addImm(ImmOffset)          // offset(imm)
3300      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3301      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3302      .addMemOperand(MMO);
3303 
3304   MI.eraseFromParent();
3305   return true;
3306 }
3307 
3308 // Produce a vector of s16 elements from s32 pieces.
3309 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3310                              ArrayRef<Register> UnmergeParts) {
3311   const LLT S16 = LLT::scalar(16);
3312 
3313   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3314   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3315     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3316 
3317   B.buildBuildVector(DstReg, RemergeParts);
3318 }
3319 
3320 /// Convert a set of s32 registers to a result vector with s16 elements.
3321 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3322                                ArrayRef<Register> UnmergeParts) {
3323   MachineRegisterInfo &MRI = *B.getMRI();
3324   const LLT V2S16 = LLT::vector(2, 16);
3325   LLT TargetTy = MRI.getType(DstReg);
3326   int NumElts = UnmergeParts.size();
3327 
3328   if (NumElts == 1) {
3329     assert(TargetTy == V2S16);
3330     B.buildBitcast(DstReg, UnmergeParts[0]);
3331     return;
3332   }
3333 
3334   SmallVector<Register, 4> RemergeParts(NumElts);
3335   for (int I = 0; I != NumElts; ++I)
3336     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3337 
3338   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3339     B.buildConcatVectors(DstReg, RemergeParts);
3340     return;
3341   }
3342 
3343   const LLT V3S16 = LLT::vector(3, 16);
3344   const LLT V6S16 = LLT::vector(6, 16);
3345 
3346   // Widen to v6s16 and unpack v3 parts.
3347   assert(TargetTy == V3S16);
3348 
3349   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3350   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3351   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3352 }
3353 
// FIXME: A plain vector truncate should be sufficient, but legalization of it
// is currently broken.
3356 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3357                                   Register WideDstReg) {
3358   const LLT S32 = LLT::scalar(32);
3359   const LLT S16 = LLT::scalar(16);
3360 
3361   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3362 
3363   int NumOps = Unmerge->getNumOperands() - 1;
3364   SmallVector<Register, 4> RemergeParts(NumOps);
3365   for (int I = 0; I != NumOps; ++I)
3366     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3367 
3368   B.buildBuildVector(DstReg, RemergeParts);
3369 }
3370 
3371 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3372     MachineInstr &MI, MachineIRBuilder &B,
3373     GISelChangeObserver &Observer,
3374     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3375   bool IsTFE = MI.getNumExplicitDefs() == 2;
3376 
  // Only d16 image operations on subtargets that use the unpacked register
  // layout, or loads whose TFE result must be repacked, need any processing
  // here.
3379 
3380   // TODO: Need to handle a16 images too
3381   // TODO: Do we need to guard against already legalized intrinsics?
3382   if (!IsTFE && !ST.hasUnpackedD16VMem())
3383     return true;
3384 
3385   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3386     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3387 
3388   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3389     return true;
3390 
3391   B.setInstr(MI);
3392 
3393   MachineRegisterInfo *MRI = B.getMRI();
3394   const LLT S32 = LLT::scalar(32);
3395   const LLT S16 = LLT::scalar(16);
3396 
3397   if (BaseOpcode->Store) { // No TFE for stores?
3398     Register VData = MI.getOperand(1).getReg();
3399     LLT Ty = MRI->getType(VData);
3400     if (!Ty.isVector() || Ty.getElementType() != S16)
3401       return true;
3402 
3403     B.setInstr(MI);
3404 
3405     Observer.changingInstr(MI);
3406     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3407     Observer.changedInstr(MI);
3408     return true;
3409   }
3410 
3411   Register DstReg = MI.getOperand(0).getReg();
3412   LLT Ty = MRI->getType(DstReg);
3413   const LLT EltTy = Ty.getScalarType();
3414   const bool IsD16 = Ty.getScalarType() == S16;
3415   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3416 
3417   if (IsTFE) {
    // In the IR, TFE is supposed to be used with a two-element struct return
    // type. The instruction really returns these two values in one
    // contiguous register, with one additional dword beyond the loaded data.
    // Rewrite the return type to use a single register result.
3422     Register Dst1Reg = MI.getOperand(1).getReg();
3423     if (MRI->getType(Dst1Reg) != S32)
3424       return false;
3425 
3426     // TODO: Make sure the TFE operand bit is set.
3427 
3428     // The raw dword aligned data component of the load. The only legal cases
3429     // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3431     LLT RoundedTy;
3432     LLT TFETy;
3433 
3434     if (IsD16 && ST.hasUnpackedD16VMem()) {
3435       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3436       TFETy = LLT::vector(NumElts + 1, 32);
3437     } else {
3438       unsigned EltSize = Ty.getScalarSizeInBits();
3439       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3440       unsigned RoundedSize = 32 * RoundedElts;
3441       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3442       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3443     }
3444 
3445     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3446     Observer.changingInstr(MI);
3447 
3448     MI.getOperand(0).setReg(TFEReg);
3449     MI.RemoveOperand(1);
3450 
3451     Observer.changedInstr(MI);
3452 
3453     // Insert after the instruction.
3454     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3455 
3456     // Now figure out how to copy the new result register back into the old
3457     // result.
3458 
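    // The last unmerge result is the extra TFE status dword, which is written
    // straight into the second original result register.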
3459     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3460     int NumDataElts = TFETy.getNumElements() - 1;
3461 
3462     if (!Ty.isVector()) {
3463       // Simplest case is a trivial unmerge (plus a truncate for d16).
3464       UnmergeResults[0] = Ty == S32 ?
3465         DstReg : MRI->createGenericVirtualRegister(S32);
3466 
3467       B.buildUnmerge(UnmergeResults, TFEReg);
3468       if (Ty != S32)
3469         B.buildTrunc(DstReg, UnmergeResults[0]);
3470       return true;
3471     }
3472 
3473     // We have to repack into a new vector of some kind.
3474     for (int I = 0; I != NumDataElts; ++I)
3475       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3476     B.buildUnmerge(UnmergeResults, TFEReg);
3477 
3478     // Drop the final TFE element.
3479     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3480 
3481     if (EltTy == S32)
3482       B.buildBuildVector(DstReg, DataPart);
3483     else if (ST.hasUnpackedD16VMem())
3484       truncToS16Vector(B, DstReg, DataPart);
3485     else
3486       bitcastToS16Vector(B, DstReg, DataPart);
3487 
3488     return true;
3489   }
3490 
3491   // Must be an image load.
3492   if (!Ty.isVector() || Ty.getElementType() != S16)
3493     return true;
3494 
3495   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3496 
3497   LLT WidenedTy = Ty.changeElementType(S32);
3498   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3499 
3500   Observer.changingInstr(MI);
3501   MI.getOperand(0).setReg(WideDstReg);
3502   Observer.changedInstr(MI);
3503 
3504   repackUnpackedD16Load(B, DstReg, WideDstReg);
3505   return true;
3506 }
3507 
3508 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3509   MachineInstr &MI, MachineIRBuilder &B,
3510   GISelChangeObserver &Observer) const {
3511   Register Dst = MI.getOperand(0).getReg();
3512   LLT Ty = B.getMRI()->getType(Dst);
3513   unsigned Size = Ty.getSizeInBits();
3514   MachineFunction &MF = B.getMF();
3515 
3516   Observer.changingInstr(MI);
3517 
3518   // FIXME: We don't really need this intermediate instruction. The intrinsic
3519   // should be fixed to have a memory operand. Since it's readnone, we're not
3520   // allowed to add one.
3521   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3522   MI.RemoveOperand(1); // Remove intrinsic ID
3523 
3524   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3525   // TODO: Should this use datalayout alignment?
3526   const unsigned MemSize = (Size + 7) / 8;
3527   const unsigned MemAlign = 4;
3528   MachineMemOperand *MMO = MF.getMachineMemOperand(
3529     MachinePointerInfo(),
3530     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3531     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3532   MI.addMemOperand(MF, MMO);
3533 
3534   // There are no 96-bit result scalar loads, but widening to 128-bit should
3535   // always be legal. We may need to restore this to a 96-bit result if it turns
3536   // out this needs to be converted to a vector load during RegBankSelect.
3537   if (!isPowerOf2_32(Size)) {
3538     LegalizerHelper Helper(MF, *this, Observer, B);
3539     B.setInstr(MI);
3540 
3541     if (Ty.isVector())
3542       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3543     else
3544       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3545   }
3546 
3547   Observer.changedInstr(MI);
3548   return true;
3549 }
3550 
3551 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3552                                             MachineIRBuilder &B,
3553                                             GISelChangeObserver &Observer) const {
3554   MachineRegisterInfo &MRI = *B.getMRI();
3555 
  // Replace the G_BRCOND that uses the intrinsic result with the
  // exec-manipulating branch pseudos.
3557   auto IntrID = MI.getIntrinsicID();
3558   switch (IntrID) {
3559   case Intrinsic::amdgcn_if:
3560   case Intrinsic::amdgcn_else: {
3561     MachineInstr *Br = nullptr;
3562     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
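      // BrCond is the G_BRCOND that consumes the intrinsic's boolean result;
      // Br is the unconditional branch that may follow it.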
3563       const SIRegisterInfo *TRI
3564         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3565 
3566       B.setInstr(*BrCond);
3567       Register Def = MI.getOperand(1).getReg();
3568       Register Use = MI.getOperand(3).getReg();
3569 
3570       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3571       if (Br)
3572         BrTarget = Br->getOperand(0).getMBB();
3573 
3574       if (IntrID == Intrinsic::amdgcn_if) {
3575         B.buildInstr(AMDGPU::SI_IF)
3576           .addDef(Def)
3577           .addUse(Use)
3578           .addMBB(BrTarget);
3579       } else {
3580         B.buildInstr(AMDGPU::SI_ELSE)
3581           .addDef(Def)
3582           .addUse(Use)
3583           .addMBB(BrTarget)
3584           .addImm(0);
3585       }
3586 
3587       if (Br)
3588         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3589 
3590       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3591       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3592       MI.eraseFromParent();
3593       BrCond->eraseFromParent();
3594       return true;
3595     }
3596 
3597     return false;
3598   }
3599   case Intrinsic::amdgcn_loop: {
3600     MachineInstr *Br = nullptr;
3601     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3602       const SIRegisterInfo *TRI
3603         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3604 
3605       B.setInstr(*BrCond);
3606 
3607       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3608       if (Br)
3609         BrTarget = Br->getOperand(0).getMBB();
3610 
3611       Register Reg = MI.getOperand(2).getReg();
3612       B.buildInstr(AMDGPU::SI_LOOP)
3613         .addUse(Reg)
3614         .addMBB(BrTarget);
3615 
3616       if (Br)
3617         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3618 
3619       MI.eraseFromParent();
3620       BrCond->eraseFromParent();
3621       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3622       return true;
3623     }
3624 
3625     return false;
3626   }
3627   case Intrinsic::amdgcn_kernarg_segment_ptr:
3628     return legalizePreloadedArgIntrin(
3629       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3630   case Intrinsic::amdgcn_implicitarg_ptr:
3631     return legalizeImplicitArgPtr(MI, MRI, B);
3632   case Intrinsic::amdgcn_workitem_id_x:
3633     return legalizePreloadedArgIntrin(MI, MRI, B,
3634                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3635   case Intrinsic::amdgcn_workitem_id_y:
3636     return legalizePreloadedArgIntrin(MI, MRI, B,
3637                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3638   case Intrinsic::amdgcn_workitem_id_z:
3639     return legalizePreloadedArgIntrin(MI, MRI, B,
3640                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3641   case Intrinsic::amdgcn_workgroup_id_x:
3642     return legalizePreloadedArgIntrin(MI, MRI, B,
3643                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3644   case Intrinsic::amdgcn_workgroup_id_y:
3645     return legalizePreloadedArgIntrin(MI, MRI, B,
3646                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3647   case Intrinsic::amdgcn_workgroup_id_z:
3648     return legalizePreloadedArgIntrin(MI, MRI, B,
3649                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3650   case Intrinsic::amdgcn_dispatch_ptr:
3651     return legalizePreloadedArgIntrin(MI, MRI, B,
3652                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3653   case Intrinsic::amdgcn_queue_ptr:
3654     return legalizePreloadedArgIntrin(MI, MRI, B,
3655                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3656   case Intrinsic::amdgcn_implicit_buffer_ptr:
3657     return legalizePreloadedArgIntrin(
3658       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3659   case Intrinsic::amdgcn_dispatch_id:
3660     return legalizePreloadedArgIntrin(MI, MRI, B,
3661                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3662   case Intrinsic::amdgcn_fdiv_fast:
3663     return legalizeFDIVFastIntrin(MI, MRI, B);
3664   case Intrinsic::amdgcn_is_shared:
3665     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3666   case Intrinsic::amdgcn_is_private:
3667     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3668   case Intrinsic::amdgcn_wavefrontsize: {
3669     B.setInstr(MI);
3670     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3671     MI.eraseFromParent();
3672     return true;
3673   }
3674   case Intrinsic::amdgcn_s_buffer_load:
3675     return legalizeSBufferLoad(MI, B, Observer);
3676   case Intrinsic::amdgcn_raw_buffer_store:
3677   case Intrinsic::amdgcn_struct_buffer_store:
3678     return legalizeBufferStore(MI, MRI, B, false, false);
3679   case Intrinsic::amdgcn_raw_buffer_store_format:
3680   case Intrinsic::amdgcn_struct_buffer_store_format:
3681     return legalizeBufferStore(MI, MRI, B, false, true);
3682   case Intrinsic::amdgcn_raw_tbuffer_store:
3683   case Intrinsic::amdgcn_struct_tbuffer_store:
3684     return legalizeBufferStore(MI, MRI, B, true, true);
3685   case Intrinsic::amdgcn_raw_buffer_load:
3686   case Intrinsic::amdgcn_struct_buffer_load:
3687     return legalizeBufferLoad(MI, MRI, B, false, false);
3688   case Intrinsic::amdgcn_raw_buffer_load_format:
3689   case Intrinsic::amdgcn_struct_buffer_load_format:
3690     return legalizeBufferLoad(MI, MRI, B, true, false);
3691   case Intrinsic::amdgcn_raw_tbuffer_load:
3692   case Intrinsic::amdgcn_struct_tbuffer_load:
3693     return legalizeBufferLoad(MI, MRI, B, true, true);
3694   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3695   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3696   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3697   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3698   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3699   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3700   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3701   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3702   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3703   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3704   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3705   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3706   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3707   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3708   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3709   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3710   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3711   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3712   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3713   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3714   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3715   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3716   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3717   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3718   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3719   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3720     return legalizeBufferAtomic(MI, B, IntrID);
3721   case Intrinsic::amdgcn_atomic_inc:
3722     return legalizeAtomicIncDec(MI, B, true);
3723   case Intrinsic::amdgcn_atomic_dec:
3724     return legalizeAtomicIncDec(MI, B, false);
3725   default: {
3726     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3727             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3728       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3729     return true;
3730   }
3731   }
3732 
3733   return true;
3734 }
3735