1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Round the number of elements up to the next power of two.
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
52 // Round the number of bits up to the next power of two.
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
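    // NextMul32 is the number of 32-bit words needed to cover Size; pick the
    // smallest element count whose total size fills that many words.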
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT QueryTy = Query.Types[TypeIdx];
176     if (!QueryTy.isVector())
177       return false;
178     const LLT EltTy = QueryTy.getElementType();
179     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
180   };
181 }
182 
183 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
184   return [=](const LegalityQuery &Query) {
185     const LLT Ty = Query.Types[TypeIdx];
186     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
187            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
188   };
189 }
190 
191 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
192   return [=](const LegalityQuery &Query) {
193     return Query.Types[TypeIdx0].getSizeInBits() <
194            Query.Types[TypeIdx1].getSizeInBits();
195   };
196 }
197 
198 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
199   return [=](const LegalityQuery &Query) {
200     return Query.Types[TypeIdx0].getSizeInBits() >
201            Query.Types[TypeIdx1].getSizeInBits();
202   };
203 }
204 
205 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
206                                          const GCNTargetMachine &TM)
207   : ST(ST_) {
208   using namespace TargetOpcode;
209 
210   auto GetAddrSpacePtr = [&TM](unsigned AS) {
211     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
212   };
213 
214   const LLT S1 = LLT::scalar(1);
215   const LLT S16 = LLT::scalar(16);
216   const LLT S32 = LLT::scalar(32);
217   const LLT S64 = LLT::scalar(64);
218   const LLT S128 = LLT::scalar(128);
219   const LLT S256 = LLT::scalar(256);
220   const LLT S1024 = LLT::scalar(1024);
221 
222   const LLT V2S16 = LLT::vector(2, 16);
223   const LLT V4S16 = LLT::vector(4, 16);
224 
225   const LLT V2S32 = LLT::vector(2, 32);
226   const LLT V3S32 = LLT::vector(3, 32);
227   const LLT V4S32 = LLT::vector(4, 32);
228   const LLT V5S32 = LLT::vector(5, 32);
229   const LLT V6S32 = LLT::vector(6, 32);
230   const LLT V7S32 = LLT::vector(7, 32);
231   const LLT V8S32 = LLT::vector(8, 32);
232   const LLT V9S32 = LLT::vector(9, 32);
233   const LLT V10S32 = LLT::vector(10, 32);
234   const LLT V11S32 = LLT::vector(11, 32);
235   const LLT V12S32 = LLT::vector(12, 32);
236   const LLT V13S32 = LLT::vector(13, 32);
237   const LLT V14S32 = LLT::vector(14, 32);
238   const LLT V15S32 = LLT::vector(15, 32);
239   const LLT V16S32 = LLT::vector(16, 32);
240   const LLT V32S32 = LLT::vector(32, 32);
241 
242   const LLT V2S64 = LLT::vector(2, 64);
243   const LLT V3S64 = LLT::vector(3, 64);
244   const LLT V4S64 = LLT::vector(4, 64);
245   const LLT V5S64 = LLT::vector(5, 64);
246   const LLT V6S64 = LLT::vector(6, 64);
247   const LLT V7S64 = LLT::vector(7, 64);
248   const LLT V8S64 = LLT::vector(8, 64);
249   const LLT V16S64 = LLT::vector(16, 64);
250 
251   std::initializer_list<LLT> AllS32Vectors =
252     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
253      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
254   std::initializer_list<LLT> AllS64Vectors =
255     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
256 
257   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
258   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
259   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
260   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
261   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
262   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
263   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
264 
265   const LLT CodePtr = FlatPtr;
266 
267   const std::initializer_list<LLT> AddrSpaces64 = {
268     GlobalPtr, ConstantPtr, FlatPtr
269   };
270 
271   const std::initializer_list<LLT> AddrSpaces32 = {
272     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
273   };
274 
275   const std::initializer_list<LLT> FPTypesBase = {
276     S32, S64
277   };
278 
279   const std::initializer_list<LLT> FPTypes16 = {
280     S32, S64, S16
281   };
282 
283   const std::initializer_list<LLT> FPTypesPK16 = {
284     S32, S64, S16, V2S16
285   };
286 
287   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
288 
289   setAction({G_BRCOND, S1}, Legal); // VCC branches
290   setAction({G_BRCOND, S32}, Legal); // SCC branches
291 
292   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
293   // elements for v3s16
294   getActionDefinitionsBuilder(G_PHI)
295     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
296     .legalFor(AllS32Vectors)
297     .legalFor(AllS64Vectors)
298     .legalFor(AddrSpaces64)
299     .legalFor(AddrSpaces32)
300     .clampScalar(0, S32, S256)
301     .widenScalarToNextPow2(0, 32)
302     .clampMaxNumElements(0, S32, 16)
303     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
304     .legalIf(isPointer(0));
305 
306   if (ST.hasVOP3PInsts()) {
307     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
308       .legalFor({S32, S16, V2S16})
309       .clampScalar(0, S16, S32)
310       .clampMaxNumElements(0, S16, 2)
311       .scalarize(0)
312       .widenScalarToNextPow2(0, 32);
313   } else if (ST.has16BitInsts()) {
314     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
315       .legalFor({S32, S16})
316       .clampScalar(0, S16, S32)
317       .scalarize(0)
318       .widenScalarToNextPow2(0, 32);
319   } else {
320     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
321       .legalFor({S32})
322       .clampScalar(0, S32, S32)
323       .scalarize(0);
324   }
325 
326   // FIXME: Not really legal. Placeholder for custom lowering.
327   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
328     .customFor({S32, S64})
329     .clampScalar(0, S32, S64)
330     .widenScalarToNextPow2(0, 32)
331     .scalarize(0);
332 
333   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
334     .legalFor({S32})
335     .clampScalar(0, S32, S32)
336     .scalarize(0);
337 
338   // Report legal for any types we can handle anywhere. For the cases only legal
339   // on the SALU, RegBankSelect will be able to re-legalize.
340   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
341     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
342     .clampScalar(0, S32, S64)
343     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
344     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
345     .widenScalarToNextPow2(0)
346     .scalarize(0);
347 
348   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
349                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
350     .legalFor({{S32, S1}, {S32, S32}})
351     .minScalar(0, S32)
352     // TODO: .scalarize(0)
353     .lower();
354 
355   getActionDefinitionsBuilder(G_BITCAST)
356     // Don't worry about the size constraint.
357     .legalIf(all(isRegisterType(0), isRegisterType(1)))
358     .lower();
359 
361   getActionDefinitionsBuilder(G_CONSTANT)
362     .legalFor({S1, S32, S64, S16, GlobalPtr,
363                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
364     .clampScalar(0, S32, S64)
365     .widenScalarToNextPow2(0)
366     .legalIf(isPointer(0));
367 
368   getActionDefinitionsBuilder(G_FCONSTANT)
369     .legalFor({S32, S64, S16})
370     .clampScalar(0, S16, S64);
371 
372   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
373     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
374                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
375     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
376     .clampScalarOrElt(0, S32, S1024)
377     .legalIf(isMultiple32(0))
378     .widenScalarToNextPow2(0, 32)
379     .clampMaxNumElements(0, S32, 16);
380 
381   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
382   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
383     .unsupportedFor({PrivatePtr})
384     .custom();
385   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
386 
387   auto &FPOpActions = getActionDefinitionsBuilder(
388     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
389     .legalFor({S32, S64});
390   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
391     .customFor({S32, S64});
392   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
393     .customFor({S32, S64});
394 
395   if (ST.has16BitInsts()) {
396     if (ST.hasVOP3PInsts())
397       FPOpActions.legalFor({S16, V2S16});
398     else
399       FPOpActions.legalFor({S16});
400 
401     TrigActions.customFor({S16});
402     FDIVActions.customFor({S16});
403   }
404 
405   auto &MinNumMaxNum = getActionDefinitionsBuilder({
406       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
407 
408   if (ST.hasVOP3PInsts()) {
409     MinNumMaxNum.customFor(FPTypesPK16)
410       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
411       .clampMaxNumElements(0, S16, 2)
412       .clampScalar(0, S16, S64)
413       .scalarize(0);
414   } else if (ST.has16BitInsts()) {
415     MinNumMaxNum.customFor(FPTypes16)
416       .clampScalar(0, S16, S64)
417       .scalarize(0);
418   } else {
419     MinNumMaxNum.customFor(FPTypesBase)
420       .clampScalar(0, S32, S64)
421       .scalarize(0);
422   }
423 
424   if (ST.hasVOP3PInsts())
425     FPOpActions.clampMaxNumElements(0, S16, 2);
426 
427   FPOpActions
428     .scalarize(0)
429     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
430 
431   TrigActions
432     .scalarize(0)
433     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
434 
435   FDIVActions
436     .scalarize(0)
437     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
438 
439   getActionDefinitionsBuilder({G_FNEG, G_FABS})
440     .legalFor(FPTypesPK16)
441     .clampMaxNumElements(0, S16, 2)
442     .scalarize(0)
443     .clampScalar(0, S16, S64);
444 
445   if (ST.has16BitInsts()) {
446     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
447       .legalFor({S32, S64, S16})
448       .scalarize(0)
449       .clampScalar(0, S16, S64);
450   } else {
451     getActionDefinitionsBuilder(G_FSQRT)
452       .legalFor({S32, S64})
453       .scalarize(0)
454       .clampScalar(0, S32, S64);
455 
456     if (ST.hasFractBug()) {
457       getActionDefinitionsBuilder(G_FFLOOR)
458         .customFor({S64})
459         .legalFor({S32, S64})
460         .scalarize(0)
461         .clampScalar(0, S32, S64);
462     } else {
463       getActionDefinitionsBuilder(G_FFLOOR)
464         .legalFor({S32, S64})
465         .scalarize(0)
466         .clampScalar(0, S32, S64);
467     }
468   }
469 
470   getActionDefinitionsBuilder(G_FPTRUNC)
471     .legalFor({{S32, S64}, {S16, S32}})
472     .scalarize(0)
473     .lower();
474 
475   getActionDefinitionsBuilder(G_FPEXT)
476     .legalFor({{S64, S32}, {S32, S16}})
477     .lowerFor({{S64, S16}}) // FIXME: Implement
478     .scalarize(0);
479 
480   getActionDefinitionsBuilder(G_FSUB)
481       // Use actual fsub instruction
482       .legalFor({S32})
483       // Must use fadd + fneg
484       .lowerFor({S64, S16, V2S16})
485       .scalarize(0)
486       .clampScalar(0, S32, S64);
487 
488   // Whether this is legal depends on the floating point mode for the function.
489   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
490   if (ST.hasMadF16())
491     FMad.customFor({S32, S16});
492   else
493     FMad.customFor({S32});
494   FMad.scalarize(0)
495       .lower();
496 
497   // TODO: Do we need to clamp maximum bitwidth?
498   getActionDefinitionsBuilder(G_TRUNC)
499     .legalIf(isScalar(0))
500     .legalFor({{V2S16, V2S32}})
501     .clampMaxNumElements(0, S16, 2)
502     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
503     // situations (like an invalid implicit use), we don't want to infinite loop
504     // in the legalizer.
505     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
506     .alwaysLegal();
507 
508   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
509     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
510                {S32, S1}, {S64, S1}, {S16, S1}})
511     .scalarize(0)
512     .clampScalar(0, S32, S64)
513     .widenScalarToNextPow2(1, 32);
514 
515   // TODO: Split s1->s64 during regbankselect for VALU.
516   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
517     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
518     .lowerFor({{S32, S64}})
519     .lowerIf(typeIs(1, S1))
520     .customFor({{S64, S64}});
521   if (ST.has16BitInsts())
522     IToFP.legalFor({{S16, S16}});
523   IToFP.clampScalar(1, S32, S64)
524        .scalarize(0)
525        .widenScalarToNextPow2(1);
526 
527   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
528     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
529     .customFor({{S64, S64}});
530   if (ST.has16BitInsts())
531     FPToI.legalFor({{S16, S16}});
532   else
533     FPToI.minScalar(1, S32);
534 
535   FPToI.minScalar(0, S32)
536        .scalarize(0)
537        .lower();
538 
539   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
540     .scalarize(0)
541     .lower();
542 
543   if (ST.has16BitInsts()) {
544     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
545       .legalFor({S16, S32, S64})
546       .clampScalar(0, S16, S64)
547       .scalarize(0);
548   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
549     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
550       .legalFor({S32, S64})
551       .clampScalar(0, S32, S64)
552       .scalarize(0);
553   } else {
554     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
555       .legalFor({S32})
556       .customFor({S64})
557       .clampScalar(0, S32, S64)
558       .scalarize(0);
559   }
560 
561   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
562     .scalarize(0)
563     .alwaysLegal();
564 
565   auto &CmpBuilder =
566     getActionDefinitionsBuilder(G_ICMP)
567     // The compare output type differs based on the register bank of the output,
568     // so make both s1 and s32 legal.
569     //
570     // Scalar compares producing output in scc will be promoted to s32, as that
571     // is the allocatable register type that will be needed for the copy from
572     // scc. This will be promoted during RegBankSelect, and we assume something
573     // before that won't try to use s32 result types.
574     //
575     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
576     // bank.
577     .legalForCartesianProduct(
578       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
579     .legalForCartesianProduct(
580       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
581   if (ST.has16BitInsts()) {
582     CmpBuilder.legalFor({{S1, S16}});
583   }
584 
585   CmpBuilder
586     .widenScalarToNextPow2(1)
587     .clampScalar(1, S32, S64)
588     .scalarize(0)
589     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
590 
591   getActionDefinitionsBuilder(G_FCMP)
592     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
593     .widenScalarToNextPow2(1)
594     .clampScalar(1, S32, S64)
595     .scalarize(0);
596 
597   // FIXME: fpow has a selection pattern that should move to custom lowering.
598   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
599   if (ST.has16BitInsts())
600     Exp2Ops.legalFor({S32, S16});
601   else
602     Exp2Ops.legalFor({S32});
603   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
604   Exp2Ops.scalarize(0);
605 
606   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
607   if (ST.has16BitInsts())
608     ExpOps.customFor({{S32}, {S16}});
609   else
610     ExpOps.customFor({S32});
611   ExpOps.clampScalar(0, MinScalarFPTy, S32)
612         .scalarize(0);
613 
614   // The 64-bit versions produce 32-bit results, but only on the SALU.
615   getActionDefinitionsBuilder(G_CTPOP)
616     .legalFor({{S32, S32}, {S32, S64}})
617     .clampScalar(0, S32, S32)
618     .clampScalar(1, S32, S64)
619     .scalarize(0)
620     .widenScalarToNextPow2(0, 32)
621     .widenScalarToNextPow2(1, 32);
622 
623   // The hardware instructions return a different result on 0 than the generic
624   // instructions expect. The hardware produces -1, but these produce the
625   // bitwidth.
626   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
627     .scalarize(0)
628     .clampScalar(0, S32, S32)
629     .clampScalar(1, S32, S64)
630     .widenScalarToNextPow2(0, 32)
631     .widenScalarToNextPow2(1, 32)
632     .lower();
633 
634   // The 64-bit versions produce 32-bit results, but only on the SALU.
635   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
636     .legalFor({{S32, S32}, {S32, S64}})
637     .clampScalar(0, S32, S32)
638     .clampScalar(1, S32, S64)
639     .scalarize(0)
640     .widenScalarToNextPow2(0, 32)
641     .widenScalarToNextPow2(1, 32);
642 
643   getActionDefinitionsBuilder(G_BITREVERSE)
644     .legalFor({S32})
645     .clampScalar(0, S32, S32)
646     .scalarize(0);
647 
648   if (ST.has16BitInsts()) {
649     getActionDefinitionsBuilder(G_BSWAP)
650       .legalFor({S16, S32, V2S16})
651       .clampMaxNumElements(0, S16, 2)
652       // FIXME: Fixing non-power-of-2 before clamp is workaround for
653       // narrowScalar limitation.
654       .widenScalarToNextPow2(0)
655       .clampScalar(0, S16, S32)
656       .scalarize(0);
657 
658     if (ST.hasVOP3PInsts()) {
659       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
660         .legalFor({S32, S16, V2S16})
661         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
662         .clampMaxNumElements(0, S16, 2)
663         .minScalar(0, S16)
664         .widenScalarToNextPow2(0)
665         .scalarize(0)
666         .lower();
667     } else {
668       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
669         .legalFor({S32, S16})
670         .widenScalarToNextPow2(0)
671         .minScalar(0, S16)
672         .scalarize(0)
673         .lower();
674     }
675   } else {
676     // TODO: Should have same legality without v_perm_b32
677     getActionDefinitionsBuilder(G_BSWAP)
678       .legalFor({S32})
679       .lowerIf(narrowerThan(0, 32))
680       // FIXME: Fixing non-power-of-2 before clamp is workaround for
681       // narrowScalar limitation.
682       .widenScalarToNextPow2(0)
683       .maxScalar(0, S32)
684       .scalarize(0)
685       .lower();
686 
687     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
688       .legalFor({S32})
689       .minScalar(0, S32)
690       .widenScalarToNextPow2(0)
691       .scalarize(0)
692       .lower();
693   }
694 
695   getActionDefinitionsBuilder(G_INTTOPTR)
696     // List the common cases
697     .legalForCartesianProduct(AddrSpaces64, {S64})
698     .legalForCartesianProduct(AddrSpaces32, {S32})
699     .scalarize(0)
700     // Accept any address space as long as the size matches
701     .legalIf(sameSize(0, 1))
702     .widenScalarIf(smallerThan(1, 0),
703       [](const LegalityQuery &Query) {
704         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
705       })
706     .narrowScalarIf(greaterThan(1, 0),
707       [](const LegalityQuery &Query) {
708         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
709       });
710 
711   getActionDefinitionsBuilder(G_PTRTOINT)
712     // List the common cases
713     .legalForCartesianProduct(AddrSpaces64, {S64})
714     .legalForCartesianProduct(AddrSpaces32, {S32})
715     .scalarize(0)
716     // Accept any address space as long as the size matches
717     .legalIf(sameSize(0, 1))
718     .widenScalarIf(smallerThan(0, 1),
719       [](const LegalityQuery &Query) {
720         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
721       })
722     .narrowScalarIf(
723       greaterThan(0, 1),
724       [](const LegalityQuery &Query) {
725         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
726       });
727 
728   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
729     .scalarize(0)
730     .custom();
731 
732   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
733   // handle some operations by just promoting the register during
734   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
735   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
736     switch (AS) {
737     // FIXME: Private element size.
738     case AMDGPUAS::PRIVATE_ADDRESS:
739       return 32;
740     // FIXME: Check subtarget
741     case AMDGPUAS::LOCAL_ADDRESS:
742       return ST.useDS128() ? 128 : 64;
743 
744     // Treat constant and global as identical. SMRD loads are sometimes usable
745     // for global loads (ideally constant address space should be eliminated)
746     // depending on the context. Legality cannot be context dependent, but
747     // RegBankSelect can split the load as necessary depending on the pointer
748     // register bank/uniformity and if the memory is invariant or not written in
749     // a kernel.
750     case AMDGPUAS::CONSTANT_ADDRESS:
751     case AMDGPUAS::GLOBAL_ADDRESS:
752       return IsLoad ? 512 : 128;
753     default:
754       return 128;
755     }
756   };
757 
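  // Return true if a load/store of the queried type must be broken into smaller
  // pieces: a vector extload, an access too wide for the address space, an
  // unsupported register count, or insufficient alignment.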
758   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
759                                     bool IsLoad) -> bool {
760     const LLT DstTy = Query.Types[0];
761 
762     // Split vector extloads.
763     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
764     unsigned Align = Query.MMODescrs[0].AlignInBits;
765 
766     if (MemSize < DstTy.getSizeInBits())
767       MemSize = std::max(MemSize, Align);
768 
769     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
770       return true;
771 
772     const LLT PtrTy = Query.Types[1];
773     unsigned AS = PtrTy.getAddressSpace();
774     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
775       return true;
776 
777     // Catch weird sized loads that don't evenly divide into the access sizes
778     // TODO: May be able to widen depending on alignment etc.
779     unsigned NumRegs = (MemSize + 31) / 32;
780     if (NumRegs == 3) {
781       if (!ST.hasDwordx3LoadStores())
782         return true;
783     } else {
784       // If the alignment allows, these should have been widened.
785       if (!isPowerOf2_32(NumRegs))
786         return true;
787     }
788 
789     if (Align < MemSize) {
790       const SITargetLowering *TLI = ST.getTargetLowering();
791       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
792     }
793 
794     return false;
795   };
796 
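  // Return true if a load with a non-power-of-2 result size can be widened to
  // the next power of 2 instead of being split, i.e. it is sufficiently aligned
  // and stays within the address space's maximum access size.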
797   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
798     unsigned Size = Query.Types[0].getSizeInBits();
799     if (isPowerOf2_32(Size))
800       return false;
801 
802     if (Size == 96 && ST.hasDwordx3LoadStores())
803       return false;
804 
805     unsigned AddrSpace = Query.Types[1].getAddressSpace();
806     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
807       return false;
808 
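    // Widening is only safe when the access is aligned to the rounded-up
    // power-of-2 size.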
809     unsigned Align = Query.MMODescrs[0].AlignInBits;
810     unsigned RoundedSize = NextPowerOf2(Size);
811     return (Align >= RoundedSize);
812   };
813 
814   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
815   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
816   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
817 
818   // TODO: Refine based on subtargets which support unaligned access or 128-bit
819   // LDS
820   // TODO: Unsupported flat for SI.
821 
822   for (unsigned Op : {G_LOAD, G_STORE}) {
823     const bool IsStore = Op == G_STORE;
824 
825     auto &Actions = getActionDefinitionsBuilder(Op);
826     // Whitelist the common cases.
827     // TODO: Loads to s16 on gfx9
828     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
829                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
830                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
831                                       {S128, GlobalPtr, 128, GlobalAlign32},
832                                       {S64, GlobalPtr, 64, GlobalAlign32},
833                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
834                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
835                                       {S32, GlobalPtr, 8, GlobalAlign8},
836                                       {S32, GlobalPtr, 16, GlobalAlign16},
837 
838                                       {S32, LocalPtr, 32, 32},
839                                       {S64, LocalPtr, 64, 32},
840                                       {V2S32, LocalPtr, 64, 32},
841                                       {S32, LocalPtr, 8, 8},
842                                       {S32, LocalPtr, 16, 16},
843                                       {V2S16, LocalPtr, 32, 32},
844 
845                                       {S32, PrivatePtr, 32, 32},
846                                       {S32, PrivatePtr, 8, 8},
847                                       {S32, PrivatePtr, 16, 16},
848                                       {V2S16, PrivatePtr, 32, 32},
849 
850                                       {S32, FlatPtr, 32, GlobalAlign32},
851                                       {S32, FlatPtr, 16, GlobalAlign16},
852                                       {S32, FlatPtr, 8, GlobalAlign8},
853                                       {V2S16, FlatPtr, 32, GlobalAlign32},
854 
855                                       {S32, ConstantPtr, 32, GlobalAlign32},
856                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
857                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
858                                       {S64, ConstantPtr, 64, GlobalAlign32},
859                                       {S128, ConstantPtr, 128, GlobalAlign32},
860                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
861     Actions
862         .customIf(typeIs(1, Constant32Ptr))
863         // Widen suitably aligned loads by loading extra elements.
864         .moreElementsIf([=](const LegalityQuery &Query) {
865             const LLT Ty = Query.Types[0];
866             return Op == G_LOAD && Ty.isVector() &&
867                    shouldWidenLoadResult(Query);
868           }, moreElementsToNextPow2(0))
869         .widenScalarIf([=](const LegalityQuery &Query) {
870             const LLT Ty = Query.Types[0];
871             return Op == G_LOAD && !Ty.isVector() &&
872                    shouldWidenLoadResult(Query);
873           }, widenScalarOrEltToNextPow2(0))
874         .narrowScalarIf(
875             [=](const LegalityQuery &Query) -> bool {
876               return !Query.Types[0].isVector() &&
877                      needToSplitMemOp(Query, Op == G_LOAD);
878             },
879             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
880               const LLT DstTy = Query.Types[0];
881               const LLT PtrTy = Query.Types[1];
882 
883               const unsigned DstSize = DstTy.getSizeInBits();
884               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
885 
886               // Split extloads.
887               if (DstSize > MemSize)
888                 return std::make_pair(0, LLT::scalar(MemSize));
889 
890               if (!isPowerOf2_32(DstSize)) {
891                 // We're probably decomposing an odd sized store. Try to split
892                 // to the widest type. TODO: Account for alignment. As-is it
893                 // should be OK, since the new parts will be further legalized.
894                 unsigned FloorSize = PowerOf2Floor(DstSize);
895                 return std::make_pair(0, LLT::scalar(FloorSize));
896               }
897 
898               if (DstSize > 32 && (DstSize % 32 != 0)) {
899                 // FIXME: Need a way to specify non-extload of larger size if
900                 // suitably aligned.
901                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
902               }
903 
904               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
905                                                      Op == G_LOAD);
906               if (MemSize > MaxSize)
907                 return std::make_pair(0, LLT::scalar(MaxSize));
908 
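              // Fall back to splitting at the known alignment, in bits.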
909               unsigned Align = Query.MMODescrs[0].AlignInBits;
910               return std::make_pair(0, LLT::scalar(Align));
911             })
912         .fewerElementsIf(
913             [=](const LegalityQuery &Query) -> bool {
914               return Query.Types[0].isVector() &&
915                      needToSplitMemOp(Query, Op == G_LOAD);
916             },
917             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
918               const LLT DstTy = Query.Types[0];
919               const LLT PtrTy = Query.Types[1];
920 
921               LLT EltTy = DstTy.getElementType();
922               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
923                                                      Op == G_LOAD);
924 
925               // FIXME: Handle widened to power of 2 results better. This ends
926               // up scalarizing.
927               // FIXME: 3 element stores scalarized on SI
928 
929               // Split if it's too large for the address space.
930               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
931                 unsigned NumElts = DstTy.getNumElements();
932                 unsigned EltSize = EltTy.getSizeInBits();
933 
934                 if (MaxSize % EltSize == 0) {
935                   return std::make_pair(
936                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
937                 }
938 
939                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
940 
941                 // FIXME: Refine when odd breakdowns handled
942                 // The scalars will need to be re-legalized.
943                 if (NumPieces == 1 || NumPieces >= NumElts ||
944                     NumElts % NumPieces != 0)
945                   return std::make_pair(0, EltTy);
946 
947                 return std::make_pair(0,
948                                       LLT::vector(NumElts / NumPieces, EltTy));
949               }
950 
951               // FIXME: We could probably handle weird extending loads better.
952               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
953               if (DstTy.getSizeInBits() > MemSize)
954                 return std::make_pair(0, EltTy);
955 
956               unsigned EltSize = EltTy.getSizeInBits();
957               unsigned DstSize = DstTy.getSizeInBits();
958               if (!isPowerOf2_32(DstSize)) {
959                 // We're probably decomposing an odd sized store. Try to split
960                 // to the widest type. TODO: Account for alignment. As-is it
961                 // should be OK, since the new parts will be further legalized.
962                 unsigned FloorSize = PowerOf2Floor(DstSize);
963                 return std::make_pair(
964                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
965               }
966 
967               // Need to split because of alignment.
968               unsigned Align = Query.MMODescrs[0].AlignInBits;
969               if (EltSize > Align &&
970                   (EltSize / Align < DstTy.getNumElements())) {
971                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
972               }
973 
974               // May need relegalization for the scalars.
975               return std::make_pair(0, EltTy);
976             })
977         .minScalar(0, S32);
978 
979     if (IsStore)
980       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
981 
982     // TODO: Need a bitcast lower option?
983     Actions
984         .legalIf([=](const LegalityQuery &Query) {
985           const LLT Ty0 = Query.Types[0];
986           unsigned Size = Ty0.getSizeInBits();
987           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
988           unsigned Align = Query.MMODescrs[0].AlignInBits;
989 
990           // FIXME: Widening store from alignment not valid.
991           if (MemSize < Size)
992             MemSize = std::max(MemSize, Align);
993 
994           // No extending vector loads.
995           if (Size > MemSize && Ty0.isVector())
996             return false;
997 
998           switch (MemSize) {
999           case 8:
1000           case 16:
1001             return Size == 32;
1002           case 32:
1003           case 64:
1004           case 128:
1005             return true;
1006           case 96:
1007             return ST.hasDwordx3LoadStores();
1008           case 256:
1009           case 512:
1010             return true;
1011           default:
1012             return false;
1013           }
1014         })
1015         .widenScalarToNextPow2(0)
1016         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1017   }
1018 
1019   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1020                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1021                                                   {S32, GlobalPtr, 16, 2 * 8},
1022                                                   {S32, LocalPtr, 8, 8},
1023                                                   {S32, LocalPtr, 16, 16},
1024                                                   {S32, PrivatePtr, 8, 8},
1025                                                   {S32, PrivatePtr, 16, 16},
1026                                                   {S32, ConstantPtr, 8, 8},
1027                                                   {S32, ConstantPtr, 16, 2 * 8}});
1028   if (ST.hasFlatAddressSpace()) {
1029     ExtLoads.legalForTypesWithMemDesc(
1030         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1031   }
1032 
1033   ExtLoads.clampScalar(0, S32, S32)
1034           .widenScalarToNextPow2(0)
1035           .unsupportedIfMemSizeNotPow2()
1036           .lower();
1037 
1038   auto &Atomics = getActionDefinitionsBuilder(
1039     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1040      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1041      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1042      G_ATOMICRMW_UMIN})
1043     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1044                {S64, GlobalPtr}, {S64, LocalPtr}});
1045   if (ST.hasFlatAddressSpace()) {
1046     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1047   }
1048 
1049   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1050     .legalFor({{S32, LocalPtr}});
1051 
1052   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
1053   // output demarshalling.
1054   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1055     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1056                 {S32, FlatPtr}, {S64, FlatPtr}})
1057     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1058                {S32, RegionPtr}, {S64, RegionPtr}});
1059   // TODO: Pointer types, any 32-bit or 64-bit vector
1060 
1061   // Condition should be s32 for scalar, s1 for vector.
1062   getActionDefinitionsBuilder(G_SELECT)
1063     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1064           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1065           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1066     .clampScalar(0, S16, S64)
1067     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1068     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1069     .scalarize(1)
1070     .clampMaxNumElements(0, S32, 2)
1071     .clampMaxNumElements(0, LocalPtr, 2)
1072     .clampMaxNumElements(0, PrivatePtr, 2)
1073     .scalarize(0)
1074     .widenScalarToNextPow2(0)
1075     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1076 
1077   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1078   // be more flexible with the shift amount type.
1079   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1080     .legalFor({{S32, S32}, {S64, S32}});
1081   if (ST.has16BitInsts()) {
1082     if (ST.hasVOP3PInsts()) {
1083       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1084             .clampMaxNumElements(0, S16, 2);
1085     } else
1086       Shifts.legalFor({{S16, S32}, {S16, S16}});
1087 
1088     // TODO: Support 16-bit shift amounts
1089     Shifts.clampScalar(1, S32, S32);
1090     Shifts.clampScalar(0, S16, S64);
1091     Shifts.widenScalarToNextPow2(0, 16);
1092   } else {
1093     // Make sure we legalize the shift amount type first, as the general
1094     // expansion for the shifted type will produce much worse code if it hasn't
1095     // been truncated already.
1096     Shifts.clampScalar(1, S32, S32);
1097     Shifts.clampScalar(0, S32, S64);
1098     Shifts.widenScalarToNextPow2(0, 32);
1099   }
1100   Shifts.scalarize(0);
1101 
1102   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1103     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1104     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1105     unsigned IdxTypeIdx = 2;
1106 
1107     getActionDefinitionsBuilder(Op)
1108       .customIf([=](const LegalityQuery &Query) {
1109           const LLT EltTy = Query.Types[EltTypeIdx];
1110           const LLT VecTy = Query.Types[VecTypeIdx];
1111           const LLT IdxTy = Query.Types[IdxTypeIdx];
1112           return (EltTy.getSizeInBits() == 16 ||
1113                   EltTy.getSizeInBits() % 32 == 0) &&
1114                  VecTy.getSizeInBits() % 32 == 0 &&
1115                  VecTy.getSizeInBits() <= 1024 &&
1116                  IdxTy.getSizeInBits() == 32;
1117         })
1118       .clampScalar(EltTypeIdx, S32, S64)
1119       .clampScalar(VecTypeIdx, S32, S64)
1120       .clampScalar(IdxTypeIdx, S32, S32);
1121   }
1122 
1123   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1124     .unsupportedIf([=](const LegalityQuery &Query) {
1125         const LLT &EltTy = Query.Types[1].getElementType();
1126         return Query.Types[0] != EltTy;
1127       });
1128 
1129   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1130     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1131     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1132 
1133     // FIXME: Doesn't handle extract of illegal sizes.
1134     getActionDefinitionsBuilder(Op)
1135       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1136       // FIXME: Multiples of 16 should not be legal.
1137       .legalIf([=](const LegalityQuery &Query) {
1138           const LLT BigTy = Query.Types[BigTyIdx];
1139           const LLT LitTy = Query.Types[LitTyIdx];
1140           return (BigTy.getSizeInBits() % 32 == 0) &&
1141                  (LitTy.getSizeInBits() % 16 == 0);
1142         })
1143       .widenScalarIf(
1144         [=](const LegalityQuery &Query) {
1145           const LLT BigTy = Query.Types[BigTyIdx];
1146           return (BigTy.getScalarSizeInBits() < 16);
1147         },
1148         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1149       .widenScalarIf(
1150         [=](const LegalityQuery &Query) {
1151           const LLT LitTy = Query.Types[LitTyIdx];
1152           return (LitTy.getScalarSizeInBits() < 16);
1153         },
1154         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1155       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1156       .widenScalarToNextPow2(BigTyIdx, 32);
1157 
1158   }
1159 
1160   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1161     .legalForCartesianProduct(AllS32Vectors, {S32})
1162     .legalForCartesianProduct(AllS64Vectors, {S64})
1163     .clampNumElements(0, V16S32, V32S32)
1164     .clampNumElements(0, V2S64, V16S64)
1165     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1166 
1167   if (ST.hasScalarPackInsts()) {
1168     BuildVector
1169       // FIXME: Should probably widen s1 vectors straight to s32
1170       .minScalarOrElt(0, S16)
1171       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1172       .minScalar(1, S32);
1173 
1174     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1175       .legalFor({V2S16, S32})
1176       .lower();
1177     BuildVector.minScalarOrElt(0, S32);
1178   } else {
1179     BuildVector.customFor({V2S16, S16});
1180     BuildVector.minScalarOrElt(0, S32);
1181 
1182     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1183       .customFor({V2S16, S32})
1184       .lower();
1185   }
1186 
1187   BuildVector.legalIf(isRegisterType(0));
1188 
1189   // FIXME: Clamp maximum size
1190   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1191     .legalIf(isRegisterType(0));
1192 
1193   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1194   // pre-legalize.
1195   if (ST.hasVOP3PInsts()) {
1196     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1197       .customFor({V2S16, V2S16})
1198       .lower();
1199   } else
1200     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1201 
1202   // Merge/Unmerge
1203   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1204     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1205     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1206 
1207     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1208       const LLT &Ty = Query.Types[TypeIdx];
1209       if (Ty.isVector()) {
1210         const LLT &EltTy = Ty.getElementType();
1211         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1212           return true;
1213         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1214           return true;
1215       }
1216       return false;
1217     };
1218 
1219     auto &Builder = getActionDefinitionsBuilder(Op)
1220       // Try to widen to s16 first for small types.
1221       // TODO: Only do this on targets with legal s16 shifts
1222       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1223 
1224       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1225       .lowerFor({{S16, V2S16}})
1226       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1227       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1228                            elementTypeIs(1, S16)),
1229                        changeTo(1, V2S16))
1230       // Clamp the little scalar to s32-s256 and make it a power of 2. It's not
1231       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1232       // valid.
1233       .clampScalar(LitTyIdx, S32, S256)
1234       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1235       // Break up vectors with weird elements into scalars
1236       .fewerElementsIf(
1237         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1238         scalarize(0))
1239       .fewerElementsIf(
1240         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1241         scalarize(1))
1242       .clampScalar(BigTyIdx, S32, S1024);
1243 
1244     if (Op == G_MERGE_VALUES) {
1245       Builder.widenScalarIf(
1246         // TODO: Use 16-bit shifts if legal for 8-bit values?
1247         [=](const LegalityQuery &Query) {
1248           const LLT Ty = Query.Types[LitTyIdx];
1249           return Ty.getSizeInBits() < 32;
1250         },
1251         changeTo(LitTyIdx, S32));
1252     }
1253 
1254     Builder.widenScalarIf(
1255       [=](const LegalityQuery &Query) {
1256         const LLT Ty = Query.Types[BigTyIdx];
1257         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1258           Ty.getSizeInBits() % 16 != 0;
1259       },
1260       [=](const LegalityQuery &Query) {
1261         // Pick the next power of 2 or, past 128 bits, the next multiple of 64,
1262         // whichever is smaller.
1263         const LLT &Ty = Query.Types[BigTyIdx];
1264         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1265         if (NewSizeInBits >= 256) {
1266           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1267           if (RoundedTo < NewSizeInBits)
1268             NewSizeInBits = RoundedTo;
1269         }
1270         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1271       })
1272       .legalIf([=](const LegalityQuery &Query) {
1273           const LLT &BigTy = Query.Types[BigTyIdx];
1274           const LLT &LitTy = Query.Types[LitTyIdx];
1275 
1276           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1277             return false;
1278           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1279             return false;
1280 
1281           return BigTy.getSizeInBits() % 16 == 0 &&
1282                  LitTy.getSizeInBits() % 16 == 0 &&
1283                  BigTy.getSizeInBits() <= 1024;
1284         })
1285       // Any vectors left are the wrong size. Scalarize them.
1286       .scalarize(0)
1287       .scalarize(1);
1288   }
1289 
1290   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1291   // RegBankSelect.
1292   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1293     .legalFor({{S32}, {S64}});
1294 
1295   if (ST.hasVOP3PInsts()) {
1296     SextInReg.lowerFor({{V2S16}})
1297       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1298       // get more vector shift opportunities, since we'll get those when
1299       // expanded.
1300       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1301   } else if (ST.has16BitInsts()) {
1302     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1303   } else {
1304     // Prefer to promote to s32 before lowering if we don't have 16-bit
1305     // shifts. This avoids a lot of intermediate truncate and extend operations.
1306     SextInReg.lowerFor({{S32}, {S64}});
1307   }
1308 
1309   SextInReg
1310     .scalarize(0)
1311     .clampScalar(0, S32, S64)
1312     .lower();
1313 
1314   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1315     .legalFor({S64});
1316 
1317   getActionDefinitionsBuilder({
1318       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1319       G_FCOPYSIGN,
1320 
1321       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1322       G_READ_REGISTER,
1323       G_WRITE_REGISTER,
1324 
1325       G_SADDO, G_SSUBO,
1326 
1327       // TODO: Implement
1328       G_FMINIMUM, G_FMAXIMUM
1329     }).lower();
1330 
1331   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1332         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1333         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1334     .unsupported();
1335 
1336   computeTables();
1337   verify(*ST.getInstrInfo());
1338 }
1339 
1340 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1341                                          MachineRegisterInfo &MRI,
1342                                          MachineIRBuilder &B,
1343                                          GISelChangeObserver &Observer) const {
1344   switch (MI.getOpcode()) {
1345   case TargetOpcode::G_ADDRSPACE_CAST:
1346     return legalizeAddrSpaceCast(MI, MRI, B);
1347   case TargetOpcode::G_FRINT:
1348     return legalizeFrint(MI, MRI, B);
1349   case TargetOpcode::G_FCEIL:
1350     return legalizeFceil(MI, MRI, B);
1351   case TargetOpcode::G_INTRINSIC_TRUNC:
1352     return legalizeIntrinsicTrunc(MI, MRI, B);
1353   case TargetOpcode::G_SITOFP:
1354     return legalizeITOFP(MI, MRI, B, true);
1355   case TargetOpcode::G_UITOFP:
1356     return legalizeITOFP(MI, MRI, B, false);
1357   case TargetOpcode::G_FPTOSI:
1358     return legalizeFPTOI(MI, MRI, B, true);
1359   case TargetOpcode::G_FPTOUI:
1360     return legalizeFPTOI(MI, MRI, B, false);
1361   case TargetOpcode::G_FMINNUM:
1362   case TargetOpcode::G_FMAXNUM:
1363   case TargetOpcode::G_FMINNUM_IEEE:
1364   case TargetOpcode::G_FMAXNUM_IEEE:
1365     return legalizeMinNumMaxNum(MI, MRI, B);
1366   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1367     return legalizeExtractVectorElt(MI, MRI, B);
1368   case TargetOpcode::G_INSERT_VECTOR_ELT:
1369     return legalizeInsertVectorElt(MI, MRI, B);
1370   case TargetOpcode::G_SHUFFLE_VECTOR:
1371     return legalizeShuffleVector(MI, MRI, B);
1372   case TargetOpcode::G_FSIN:
1373   case TargetOpcode::G_FCOS:
1374     return legalizeSinCos(MI, MRI, B);
1375   case TargetOpcode::G_GLOBAL_VALUE:
1376     return legalizeGlobalValue(MI, MRI, B);
1377   case TargetOpcode::G_LOAD:
1378     return legalizeLoad(MI, MRI, B, Observer);
1379   case TargetOpcode::G_FMAD:
1380     return legalizeFMad(MI, MRI, B);
1381   case TargetOpcode::G_FDIV:
1382     return legalizeFDIV(MI, MRI, B);
1383   case TargetOpcode::G_UDIV:
1384   case TargetOpcode::G_UREM:
1385     return legalizeUDIV_UREM(MI, MRI, B);
1386   case TargetOpcode::G_SDIV:
1387   case TargetOpcode::G_SREM:
1388     return legalizeSDIV_SREM(MI, MRI, B);
1389   case TargetOpcode::G_ATOMIC_CMPXCHG:
1390     return legalizeAtomicCmpXChg(MI, MRI, B);
1391   case TargetOpcode::G_FLOG:
1392     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1393   case TargetOpcode::G_FLOG10:
1394     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1395   case TargetOpcode::G_FEXP:
1396     return legalizeFExp(MI, B);
1397   case TargetOpcode::G_FPOW:
1398     return legalizeFPow(MI, B);
1399   case TargetOpcode::G_FFLOOR:
1400     return legalizeFFloor(MI, MRI, B);
1401   case TargetOpcode::G_BUILD_VECTOR:
1402     return legalizeBuildVector(MI, MRI, B);
1403   default:
1404     return false;
1405   }
1406 
1407   llvm_unreachable("expected switch to return");
1408 }
1409 
1410 Register AMDGPULegalizerInfo::getSegmentAperture(
1411   unsigned AS,
1412   MachineRegisterInfo &MRI,
1413   MachineIRBuilder &B) const {
1414   MachineFunction &MF = B.getMF();
1415   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1416   const LLT S32 = LLT::scalar(32);
1417 
1418   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1419 
1420   if (ST.hasApertureRegs()) {
1421     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1422     // getreg.
1423     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1424         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1425         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1426     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1427         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1428         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1429     unsigned Encoding =
1430         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1431         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1432         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1433 
1434     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1435 
1436     B.buildInstr(AMDGPU::S_GETREG_B32)
1437       .addDef(GetReg)
1438       .addImm(Encoding);
1439     MRI.setType(GetReg, S32);
1440 
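    // The getreg result holds the aperture base in its low 16 bits
    // (WidthM1 + 1 == 16); shift it into the upper bits to form the 32-bit
    // value that is used as the high half of the flat address.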
1441     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1442     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1443   }
1444 
1445   Register QueuePtr = MRI.createGenericVirtualRegister(
1446     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1447 
1448   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1449   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1450     return Register();
1451 
1452   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1453   // private_segment_aperture_base_hi.
1454   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1455 
1456   // TODO: can we be smarter about machine pointer info?
1457   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1458   MachineMemOperand *MMO = MF.getMachineMemOperand(
1459     PtrInfo,
1460     MachineMemOperand::MOLoad |
1461     MachineMemOperand::MODereferenceable |
1462     MachineMemOperand::MOInvariant,
1463     4,
1464     MinAlign(64, StructOffset));
1465 
1466   Register LoadAddr;
1467 
1468   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1469   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1470 }
1471 
1472 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1473   MachineInstr &MI, MachineRegisterInfo &MRI,
1474   MachineIRBuilder &B) const {
1475   MachineFunction &MF = B.getMF();
1476 
1477   B.setInstr(MI);
1478 
1479   const LLT S32 = LLT::scalar(32);
1480   Register Dst = MI.getOperand(0).getReg();
1481   Register Src = MI.getOperand(1).getReg();
1482 
1483   LLT DstTy = MRI.getType(Dst);
1484   LLT SrcTy = MRI.getType(Src);
1485   unsigned DestAS = DstTy.getAddressSpace();
1486   unsigned SrcAS = SrcTy.getAddressSpace();
1487 
1488   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1489   // vector element.
1490   assert(!DstTy.isVector());
1491 
1492   const AMDGPUTargetMachine &TM
1493     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1494 
1495   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1496   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1497     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1498     return true;
1499   }
1500 
1501   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1502     // Truncate.
1503     B.buildExtract(Dst, Src, 0);
1504     MI.eraseFromParent();
1505     return true;
1506   }
1507 
1508   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1509     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1510     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1511 
1512     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1513     // another. Merge operands are required to be the same type, but creating an
1514     // extra ptrtoint would be kind of pointless.
1515     auto HighAddr = B.buildConstant(
1516       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1517     B.buildMerge(Dst, {Src, HighAddr});
1518     MI.eraseFromParent();
1519     return true;
1520   }
1521 
1522   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1523     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1524            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1525     unsigned NullVal = TM.getNullPointerValue(DestAS);
1526 
1527     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1528     auto FlatNull = B.buildConstant(SrcTy, 0);
1529 
1530     // Extract low 32-bits of the pointer.
1531     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1532 
1533     auto CmpRes =
1534         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1535     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1536 
1537     MI.eraseFromParent();
1538     return true;
1539   }
1540 
1541   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1542     return false;
1543 
1544   if (!ST.hasFlatAddressSpace())
1545     return false;
1546 
1547   auto SegmentNull =
1548       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1549   auto FlatNull =
1550       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1551 
1552   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1553   if (!ApertureReg.isValid())
1554     return false;
1555 
1556   auto CmpRes =
1557       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1558 
1559   // Coerce the type of the low half of the result so we can use merge_values.
1560   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1561 
1562   // TODO: Should we allow mismatched types but matching sizes in merges to
1563   // avoid the ptrtoint?
1564   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1565   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1566 
1567   MI.eraseFromParent();
1568   return true;
1569 }
1570 
1571 bool AMDGPULegalizerInfo::legalizeFrint(
1572   MachineInstr &MI, MachineRegisterInfo &MRI,
1573   MachineIRBuilder &B) const {
1574   B.setInstr(MI);
1575 
1576   Register Src = MI.getOperand(1).getReg();
1577   LLT Ty = MRI.getType(Src);
1578   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1579 
1580   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1581   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1582 
1583   auto C1 = B.buildFConstant(Ty, C1Val);
1584   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1585 
1586   // TODO: Should this propagate fast-math-flags?
1587   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1588   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1589 
1590   auto C2 = B.buildFConstant(Ty, C2Val);
1591   auto Fabs = B.buildFAbs(Ty, Src);
1592 
1593   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1594   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
1595   return true;
1596 }
1597 
1598 bool AMDGPULegalizerInfo::legalizeFceil(
1599   MachineInstr &MI, MachineRegisterInfo &MRI,
1600   MachineIRBuilder &B) const {
1601   B.setInstr(MI);
1602 
1603   const LLT S1 = LLT::scalar(1);
1604   const LLT S64 = LLT::scalar(64);
1605 
1606   Register Src = MI.getOperand(1).getReg();
1607   assert(MRI.getType(Src) == S64);
1608 
1609   // result = trunc(src)
1610   // if (src > 0.0 && src != result)
1611   //   result += 1.0
1612 
1613   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1614 
1615   const auto Zero = B.buildFConstant(S64, 0.0);
1616   const auto One = B.buildFConstant(S64, 1.0);
1617   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1618   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1619   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1620   auto Add = B.buildSelect(S64, And, One, Zero);
1621 
1622   // TODO: Should this propagate fast-math-flags?
1623   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
1624   return true;
1625 }
1626 
1627 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1628                                               MachineIRBuilder &B) {
1629   const unsigned FractBits = 52;
1630   const unsigned ExpBits = 11;
1631   LLT S32 = LLT::scalar(32);
1632 
1633   auto Const0 = B.buildConstant(S32, FractBits - 32);
1634   auto Const1 = B.buildConstant(S32, ExpBits);
1635 
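  // Extract the 11 exponent bits, which start at bit 20 of the high word
  // (bit 52 of the full double), and remove the exponent bias of 1023.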
1636   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
1637     .addUse(Const0.getReg(0))
1638     .addUse(Const1.getReg(0));
1639 
1640   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1641 }
1642 
1643 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1644   MachineInstr &MI, MachineRegisterInfo &MRI,
1645   MachineIRBuilder &B) const {
1646   B.setInstr(MI);
1647 
1648   const LLT S1 = LLT::scalar(1);
1649   const LLT S32 = LLT::scalar(32);
1650   const LLT S64 = LLT::scalar(64);
1651 
1652   Register Src = MI.getOperand(1).getReg();
1653   assert(MRI.getType(Src) == S64);
1654 
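  // trunc(x) for f64: if the unbiased exponent is negative the result is just
  // the sign (+/-0); if it is greater than 51 the value is already an integer;
  // otherwise clear the fraction bits that lie below the exponent.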
1655   // TODO: Should this use extract since the low half is unused?
1656   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1657   Register Hi = Unmerge.getReg(1);
1658 
1659   // Extract the upper half, since this is where we will find the sign and
1660   // exponent.
1661   auto Exp = extractF64Exponent(Hi, B);
1662 
1663   const unsigned FractBits = 52;
1664 
1665   // Extract the sign bit.
1666   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1667   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1668 
1669   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1670 
1671   const auto Zero32 = B.buildConstant(S32, 0);
1672 
1673   // Extend back to 64-bits.
1674   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1675 
1676   auto Shr = B.buildAShr(S64, FractMask, Exp);
1677   auto Not = B.buildNot(S64, Shr);
1678   auto Tmp0 = B.buildAnd(S64, Src, Not);
1679   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1680 
1681   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1682   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1683 
1684   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1685   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
1686   return true;
1687 }
1688 
1689 bool AMDGPULegalizerInfo::legalizeITOFP(
1690   MachineInstr &MI, MachineRegisterInfo &MRI,
1691   MachineIRBuilder &B, bool Signed) const {
1692   B.setInstr(MI);
1693 
1694   Register Dst = MI.getOperand(0).getReg();
1695   Register Src = MI.getOperand(1).getReg();
1696 
1697   const LLT S64 = LLT::scalar(64);
1698   const LLT S32 = LLT::scalar(32);
1699 
1700   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1701 
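  // Split the 64-bit integer into 32-bit halves and convert each half
  // separately: result = convert(Hi) * 2^32 + uitofp(Lo), where the 2^32
  // scale on the high half is applied with amdgcn.ldexp.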
1702   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1703 
1704   auto CvtHi = Signed ?
1705     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1706     B.buildUITOFP(S64, Unmerge.getReg(1));
1707 
1708   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1709 
1710   auto ThirtyTwo = B.buildConstant(S32, 32);
1711   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1712     .addUse(CvtHi.getReg(0))
1713     .addUse(ThirtyTwo.getReg(0));
1714 
1715   // TODO: Should this propagate fast-math-flags?
1716   B.buildFAdd(Dst, LdExp, CvtLo);
1717   MI.eraseFromParent();
1718   return true;
1719 }
1720 
1721 // TODO: Copied from DAG implementation. Verify logic and document how this
1722 // actually works.
1723 bool AMDGPULegalizerInfo::legalizeFPTOI(
1724   MachineInstr &MI, MachineRegisterInfo &MRI,
1725   MachineIRBuilder &B, bool Signed) const {
1726   B.setInstr(MI);
1727 
1728   Register Dst = MI.getOperand(0).getReg();
1729   Register Src = MI.getOperand(1).getReg();
1730 
1731   const LLT S64 = LLT::scalar(64);
1732   const LLT S32 = LLT::scalar(32);
1733 
1734   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1735 
1736   unsigned Flags = MI.getFlags();
1737 
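  // K0 is 2^-32 and K1 is -2^32. The high half is computed from
  // floor(trunc(x) * 2^-32); the low half is the remainder
  // fma(floor(trunc(x) * 2^-32), -2^32, trunc(x)) converted to unsigned.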
1738   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1739   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1740   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1741 
1742   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1743   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1744   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1745 
1746   auto Hi = Signed ?
1747     B.buildFPTOSI(S32, FloorMul) :
1748     B.buildFPTOUI(S32, FloorMul);
1749   auto Lo = B.buildFPTOUI(S32, Fma);
1750 
1751   B.buildMerge(Dst, { Lo, Hi });
1752   MI.eraseFromParent();
1753 
1754   return true;
1755 }
1756 
1757 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1758   MachineInstr &MI, MachineRegisterInfo &MRI,
1759   MachineIRBuilder &B) const {
1760   MachineFunction &MF = B.getMF();
1761   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1762 
1763   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1764                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1765 
1766   // With ieee_mode disabled, the instructions have the correct behavior
1767   // already for G_FMINNUM/G_FMAXNUM
1768   if (!MFI->getMode().IEEE)
1769     return !IsIEEEOp;
1770 
1771   if (IsIEEEOp)
1772     return true;
1773 
1774   MachineIRBuilder HelperBuilder(MI);
1775   GISelObserverWrapper DummyObserver;
1776   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1777   HelperBuilder.setInstr(MI);
1778   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1779 }
1780 
1781 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1782   MachineInstr &MI, MachineRegisterInfo &MRI,
1783   MachineIRBuilder &B) const {
1784   // TODO: Should move some of this into LegalizerHelper.
1785 
1786   // TODO: Promote dynamic indexing of s16 to s32
1787 
1788   // FIXME: Artifact combiner probably should have replaced the truncated
1789   // constant before this, so we shouldn't need
1790   // getConstantVRegValWithLookThrough.
1791   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1792     MI.getOperand(2).getReg(), MRI);
1793   if (!IdxVal) // Dynamic case will be selected to register indexing.
1794     return true;
1795 
1796   Register Dst = MI.getOperand(0).getReg();
1797   Register Vec = MI.getOperand(1).getReg();
1798 
1799   LLT VecTy = MRI.getType(Vec);
1800   LLT EltTy = VecTy.getElementType();
1801   assert(EltTy == MRI.getType(Dst));
1802 
1803   B.setInstr(MI);
1804 
1805   if (IdxVal->Value < VecTy.getNumElements())
1806     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1807   else
1808     B.buildUndef(Dst);
1809 
1810   MI.eraseFromParent();
1811   return true;
1812 }
1813 
1814 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1815   MachineInstr &MI, MachineRegisterInfo &MRI,
1816   MachineIRBuilder &B) const {
1817   // TODO: Should move some of this into LegalizerHelper.
1818 
1819   // TODO: Promote dynamic indexing of s16 to s32
1820 
1821   // FIXME: Artifact combiner probably should have replaced the truncated
1822   // constant before this, so we shouldn't need
1823   // getConstantVRegValWithLookThrough.
1824   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1825     MI.getOperand(3).getReg(), MRI);
1826   if (!IdxVal) // Dynamic case will be selected to register indexing.
1827     return true;
1828 
1829   Register Dst = MI.getOperand(0).getReg();
1830   Register Vec = MI.getOperand(1).getReg();
1831   Register Ins = MI.getOperand(2).getReg();
1832 
1833   LLT VecTy = MRI.getType(Vec);
1834   LLT EltTy = VecTy.getElementType();
1835   assert(EltTy == MRI.getType(Ins));
1836 
1837   B.setInstr(MI);
1838 
1839   if (IdxVal->Value < VecTy.getNumElements())
1840     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1841   else
1842     B.buildUndef(Dst);
1843 
1844   MI.eraseFromParent();
1845   return true;
1846 }
1847 
1848 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1849   MachineInstr &MI, MachineRegisterInfo &MRI,
1850   MachineIRBuilder &B) const {
1851   const LLT V2S16 = LLT::vector(2, 16);
1852 
1853   Register Dst = MI.getOperand(0).getReg();
1854   Register Src0 = MI.getOperand(1).getReg();
1855   LLT DstTy = MRI.getType(Dst);
1856   LLT SrcTy = MRI.getType(Src0);
1857 
1858   if (SrcTy == V2S16 && DstTy == V2S16 &&
1859       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1860     return true;
1861 
1862   MachineIRBuilder HelperBuilder(MI);
1863   GISelObserverWrapper DummyObserver;
1864   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1865   HelperBuilder.setInstr(MI);
1866   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1867 }
1868 
1869 bool AMDGPULegalizerInfo::legalizeSinCos(
1870   MachineInstr &MI, MachineRegisterInfo &MRI,
1871   MachineIRBuilder &B) const {
1872   B.setInstr(MI);
1873 
1874   Register DstReg = MI.getOperand(0).getReg();
1875   Register SrcReg = MI.getOperand(1).getReg();
1876   LLT Ty = MRI.getType(DstReg);
1877   unsigned Flags = MI.getFlags();
1878 
1879   Register TrigVal;
1880   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1881   if (ST.hasTrigReducedRange()) {
1882     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1883     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1884       .addUse(MulVal.getReg(0))
1885       .setMIFlags(Flags).getReg(0);
1886   } else
1887     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1888 
1889   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1890     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1891   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1892     .addUse(TrigVal)
1893     .setMIFlags(Flags);
1894   MI.eraseFromParent();
1895   return true;
1896 }
1897 
1898 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1899   Register DstReg, LLT PtrTy,
1900   MachineIRBuilder &B, const GlobalValue *GV,
1901   unsigned Offset, unsigned GAFlags) const {
1902   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1903   // to the following code sequence:
1904   //
1905   // For constant address space:
1906   //   s_getpc_b64 s[0:1]
1907   //   s_add_u32 s0, s0, $symbol
1908   //   s_addc_u32 s1, s1, 0
1909   //
1910   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1911   //   a fixup or relocation is emitted to replace $symbol with a literal
1912   //   constant, which is a pc-relative offset from the encoding of the $symbol
1913   //   operand to the global variable.
1914   //
1915   // For global address space:
1916   //   s_getpc_b64 s[0:1]
1917   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1918   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1919   //
1920   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1921   //   fixups or relocations are emitted to replace $symbol@*@lo and
1922   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1923   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1924   //   operand to the global variable.
1925   //
1926   // What we want here is an offset from the value returned by s_getpc
1927   // (which is the address of the s_add_u32 instruction) to the global
1928   // variable, but since the encoding of $symbol starts 4 bytes after the start
1929   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1930   // small. This requires us to add 4 to the global variable offset in order to
1931   // compute the correct address.
1932 
1933   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1934 
1935   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1936     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1937 
1938   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1939     .addDef(PCReg);
1940 
1941   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1942   if (GAFlags == SIInstrInfo::MO_NONE)
1943     MIB.addImm(0);
1944   else
1945     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1946 
1947   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1948 
1949   if (PtrTy.getSizeInBits() == 32)
1950     B.buildExtract(DstReg, PCReg, 0);
1951   return true;
1952 }
1953 
1954 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1955   MachineInstr &MI, MachineRegisterInfo &MRI,
1956   MachineIRBuilder &B) const {
1957   Register DstReg = MI.getOperand(0).getReg();
1958   LLT Ty = MRI.getType(DstReg);
1959   unsigned AS = Ty.getAddressSpace();
1960 
1961   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1962   MachineFunction &MF = B.getMF();
1963   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1964   B.setInstr(MI);
1965 
1966   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1967     if (!MFI->isEntryFunction()) {
1968       const Function &Fn = MF.getFunction();
1969       DiagnosticInfoUnsupported BadLDSDecl(
1970         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1971         DS_Warning);
1972       Fn.getContext().diagnose(BadLDSDecl);
1973 
1974       // We currently don't have a way to correctly allocate LDS objects that
1975       // aren't directly associated with a kernel. We do force inlining of
1976       // functions that use local objects. However, if these dead functions are
1977       // not eliminated, we don't want a compile time error. Just emit a warning
1978       // and a trap, since there should be no callable path here.
1979       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1980       B.buildUndef(DstReg);
1981       MI.eraseFromParent();
1982       return true;
1983     }
1984 
1985     // TODO: We could emit code to handle the initialization somewhere.
1986     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1987       const SITargetLowering *TLI = ST.getTargetLowering();
1988       if (!TLI->shouldUseLDSConstAddress(GV)) {
1989         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1990         return true; // Leave in place;
1991       }
1992 
1993       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1994       MI.eraseFromParent();
1995       return true;
1996     }
1997 
1998     const Function &Fn = MF.getFunction();
1999     DiagnosticInfoUnsupported BadInit(
2000       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2001     Fn.getContext().diagnose(BadInit);
2002     return true;
2003   }
2004 
2005   const SITargetLowering *TLI = ST.getTargetLowering();
2006 
2007   if (TLI->shouldEmitFixup(GV)) {
2008     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2009     MI.eraseFromParent();
2010     return true;
2011   }
2012 
2013   if (TLI->shouldEmitPCReloc(GV)) {
2014     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2015     MI.eraseFromParent();
2016     return true;
2017   }
2018 
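  // Otherwise load the 64-bit address of the global from the GOT, which is
  // itself addressed pc-relative.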
2019   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2020   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2021 
2022   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2023     MachinePointerInfo::getGOT(MF),
2024     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2025     MachineMemOperand::MOInvariant,
2026     8 /*Size*/, 8 /*Align*/);
2027 
2028   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2029 
2030   if (Ty.getSizeInBits() == 32) {
2031     // Truncate if this is a 32-bit constant address.
2032     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2033     B.buildExtract(DstReg, Load, 0);
2034   } else
2035     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2036 
2037   MI.eraseFromParent();
2038   return true;
2039 }
2040 
2041 bool AMDGPULegalizerInfo::legalizeLoad(
2042   MachineInstr &MI, MachineRegisterInfo &MRI,
2043   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2044   B.setInstr(MI);
2045   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2046   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2047   Observer.changingInstr(MI);
2048   MI.getOperand(1).setReg(Cast.getReg(0));
2049   Observer.changedInstr(MI);
2050   return true;
2051 }
2052 
2053 bool AMDGPULegalizerInfo::legalizeFMad(
2054   MachineInstr &MI, MachineRegisterInfo &MRI,
2055   MachineIRBuilder &B) const {
2056   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2057   assert(Ty.isScalar());
2058 
2059   MachineFunction &MF = B.getMF();
2060   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2061 
2062   // TODO: Always legal with future ftz flag.
2063   // FIXME: Do we need just output?
2064   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2065     return true;
2066   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2067     return true;
2068 
2069   MachineIRBuilder HelperBuilder(MI);
2070   GISelObserverWrapper DummyObserver;
2071   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2072   HelperBuilder.setMBB(*MI.getParent());
2073   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2074 }
2075 
2076 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2077   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2078   Register DstReg = MI.getOperand(0).getReg();
2079   Register PtrReg = MI.getOperand(1).getReg();
2080   Register CmpVal = MI.getOperand(2).getReg();
2081   Register NewVal = MI.getOperand(3).getReg();
2082 
2083   assert(SITargetLowering::isFlatGlobalAddrSpace(
2084            MRI.getType(PtrReg).getAddressSpace()) &&
2085          "this should not have been custom lowered");
2086 
2087   LLT ValTy = MRI.getType(CmpVal);
2088   LLT VecTy = LLT::vector(2, ValTy);
2089 
2090   B.setInstr(MI);
2091   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2092 
2093   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2094     .addDef(DstReg)
2095     .addUse(PtrReg)
2096     .addUse(PackedVal)
2097     .setMemRefs(MI.memoperands());
2098 
2099   MI.eraseFromParent();
2100   return true;
2101 }
2102 
2103 bool AMDGPULegalizerInfo::legalizeFlog(
2104   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2105   Register Dst = MI.getOperand(0).getReg();
2106   Register Src = MI.getOperand(1).getReg();
2107   LLT Ty = B.getMRI()->getType(Dst);
2108   unsigned Flags = MI.getFlags();
2109   B.setInstr(MI);
2110 
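  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes the precomputed
  // 1 / log2(b) constant.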
2111   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2112   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2113 
2114   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2115   MI.eraseFromParent();
2116   return true;
2117 }
2118 
2119 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2120                                        MachineIRBuilder &B) const {
2121   Register Dst = MI.getOperand(0).getReg();
2122   Register Src = MI.getOperand(1).getReg();
2123   unsigned Flags = MI.getFlags();
2124   LLT Ty = B.getMRI()->getType(Dst);
2125   B.setInstr(MI);
2126 
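  // exp(x) = exp2(x * log2(e)).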
2127   auto K = B.buildFConstant(Ty, numbers::log2e);
2128   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2129   B.buildFExp2(Dst, Mul, Flags);
2130   MI.eraseFromParent();
2131   return true;
2132 }
2133 
2134 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2135                                        MachineIRBuilder &B) const {
2136   Register Dst = MI.getOperand(0).getReg();
2137   Register Src0 = MI.getOperand(1).getReg();
2138   Register Src1 = MI.getOperand(2).getReg();
2139   unsigned Flags = MI.getFlags();
2140   LLT Ty = B.getMRI()->getType(Dst);
2141   B.setInstr(MI);
2142   const LLT S16 = LLT::scalar(16);
2143   const LLT S32 = LLT::scalar(32);
2144 
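  // Expand pow(x, y) as exp2(y * log2(x)), using the legacy multiply for the
  // intermediate product.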
2145   if (Ty == S32) {
2146     auto Log = B.buildFLog2(S32, Src0, Flags);
2147     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2148       .addUse(Log.getReg(0))
2149       .addUse(Src1)
2150       .setMIFlags(Flags);
2151     B.buildFExp2(Dst, Mul, Flags);
2152   } else if (Ty == S16) {
2153     // There's no f16 fmul_legacy, so we need to convert for it.
2154     auto Log = B.buildFLog2(S16, Src0, Flags);
2155     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2156     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2157     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2158       .addUse(Ext0.getReg(0))
2159       .addUse(Ext1.getReg(0))
2160       .setMIFlags(Flags);
2161 
2162     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2163   } else
2164     return false;
2165 
2166   MI.eraseFromParent();
2167   return true;
2168 }
2169 
2170 // Find a source register, ignoring any possible source modifiers.
2171 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2172   Register ModSrc = OrigSrc;
2173   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2174     ModSrc = SrcFNeg->getOperand(1).getReg();
2175     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2176       ModSrc = SrcFAbs->getOperand(1).getReg();
2177   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2178     ModSrc = SrcFAbs->getOperand(1).getReg();
2179   return ModSrc;
2180 }
2181 
2182 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2183                                          MachineRegisterInfo &MRI,
2184                                          MachineIRBuilder &B) const {
2185   B.setInstr(MI);
2186 
2187   const LLT S1 = LLT::scalar(1);
2188   const LLT S64 = LLT::scalar(64);
2189   Register Dst = MI.getOperand(0).getReg();
2190   Register OrigSrc = MI.getOperand(1).getReg();
2191   unsigned Flags = MI.getFlags();
2192   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2193          "this should not have been custom lowered");
2194 
2195   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2196   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2197   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2198   // V_FRACT bug is:
2199   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2200   //
2201   // Convert floor(x) to (x - fract(x))
2202 
2203   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2204     .addUse(OrigSrc)
2205     .setMIFlags(Flags);
2206 
2207   // Give source modifier matching some assistance before obscuring a foldable
2208   // pattern.
2209 
2210   // TODO: We can avoid the neg on the fract? The input sign to fract
2211   // shouldn't matter?
2212   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2213 
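  // 0x3fefffffffffffff is the largest double strictly less than 1.0, used to
  // clamp the buggy V_FRACT result as described above.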
2214   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2215 
2216   Register Min = MRI.createGenericVirtualRegister(S64);
2217 
2218   // We don't need to concern ourselves with the snan handling difference, so
2219   // use the one which will directly select.
2220   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2221   if (MFI->getMode().IEEE)
2222     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2223   else
2224     B.buildFMinNum(Min, Fract, Const, Flags);
2225 
2226   Register CorrectedFract = Min;
2227   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2228     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2229     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2230   }
2231 
2232   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2233   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2234 
2235   MI.eraseFromParent();
2236   return true;
2237 }
2238 
2239 // Turn an illegal packed v2s16 build vector into bit operations.
2240 // TODO: This should probably be a bitcast action in LegalizerHelper.
2241 bool AMDGPULegalizerInfo::legalizeBuildVector(
2242   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2243   Register Dst = MI.getOperand(0).getReg();
2244   LLT DstTy = MRI.getType(Dst);
2245   const LLT S32 = LLT::scalar(32);
2246   const LLT V2S16 = LLT::vector(2, 16);
2247   (void)DstTy;
2248   (void)V2S16;
2249   assert(DstTy == V2S16);
2250 
2251   Register Src0 = MI.getOperand(1).getReg();
2252   Register Src1 = MI.getOperand(2).getReg();
2253   assert(MRI.getType(Src0) == LLT::scalar(16));
2254 
2255   B.setInstr(MI);
2256   auto Merge = B.buildMerge(S32, {Src0, Src1});
2257   B.buildBitcast(Dst, Merge);
2258 
2259   MI.eraseFromParent();
2260   return true;
2261 }
2262 
2263 // Return the use branch instruction, otherwise null if the usage is invalid.
2264 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2265                                        MachineRegisterInfo &MRI,
2266                                        MachineInstr *&Br) {
2267   Register CondDef = MI.getOperand(0).getReg();
2268   if (!MRI.hasOneNonDBGUse(CondDef))
2269     return nullptr;
2270 
2271   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2272   if (UseMI.getParent() != MI.getParent() ||
2273       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2274     return nullptr;
2275 
2276   // Make sure the cond br is followed by a G_BR
2277   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2278   if (Next != MI.getParent()->end()) {
2279     if (Next->getOpcode() != AMDGPU::G_BR)
2280       return nullptr;
2281     Br = &*Next;
2282   }
2283 
2284   return &UseMI;
2285 }
2286 
2287 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2288                                                MachineRegisterInfo &MRI,
2289                                                Register LiveIn,
2290                                                Register PhyReg) const {
2291   assert(PhyReg.isPhysical() && "Physical register expected");
2292 
2293   // Insert the live-in copy, if required, by defining destination virtual
2294   // register.
2295   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2296   if (!MRI.getVRegDef(LiveIn)) {
2297     // FIXME: Should have scoped insert pt
2298     MachineBasicBlock &OrigInsBB = B.getMBB();
2299     auto OrigInsPt = B.getInsertPt();
2300 
2301     MachineBasicBlock &EntryMBB = B.getMF().front();
2302     EntryMBB.addLiveIn(PhyReg);
2303     B.setInsertPt(EntryMBB, EntryMBB.begin());
2304     B.buildCopy(LiveIn, PhyReg);
2305 
2306     B.setInsertPt(OrigInsBB, OrigInsPt);
2307   }
2308 
2309   return LiveIn;
2310 }
2311 
2312 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2313                                                 MachineRegisterInfo &MRI,
2314                                                 Register PhyReg, LLT Ty,
2315                                                 bool InsertLiveInCopy) const {
2316   assert(PhyReg.isPhysical() && "Physical register expected");
2317 
2318   // Get or create the virtual live-in register.
2319   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2320   if (!LiveIn) {
2321     LiveIn = MRI.createGenericVirtualRegister(Ty);
2322     MRI.addLiveIn(PhyReg, LiveIn);
2323   }
2324 
2325   // When the actual copy required is from a virtual register to a physical
2326   // register (to be inserted later), inserting a live-in copy from the
2327   // physical register to a virtual register is not required here.
2328   if (!InsertLiveInCopy)
2329     return LiveIn;
2330 
2331   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2332 }
2333 
2334 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2335     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2336   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2337   const ArgDescriptor *Arg;
2338   const TargetRegisterClass *RC;
2339   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2340   if (!Arg) {
2341     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2342     return nullptr;
2343   }
2344   return Arg;
2345 }
2346 
2347 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2348                                          const ArgDescriptor *Arg) const {
2349   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2350     return false; // TODO: Handle these
2351 
2352   Register SrcReg = Arg->getRegister();
2353   assert(SrcReg.isPhysical() && "Physical register expected");
2354   assert(DstReg.isVirtual() && "Virtual register expected");
2355 
2356   MachineRegisterInfo &MRI = *B.getMRI();
2357 
2358   LLT Ty = MRI.getType(DstReg);
2359   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2360 
2361   if (Arg->isMasked()) {
2362     // TODO: Should we try to emit this once in the entry block?
2363     const LLT S32 = LLT::scalar(32);
2364     const unsigned Mask = Arg->getMask();
2365     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2366 
2367     Register AndMaskSrc = LiveIn;
2368 
2369     if (Shift != 0) {
2370       auto ShiftAmt = B.buildConstant(S32, Shift);
2371       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2372     }
2373 
2374     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2375   } else {
2376     B.buildCopy(DstReg, LiveIn);
2377   }
2378 
2379   return true;
2380 }
2381 
2382 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2383     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2384     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2385   B.setInstr(MI);
2386 
2387   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2388   if (!Arg)
2389     return false;
2390 
2391   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2392     return false;
2393 
2394   MI.eraseFromParent();
2395   return true;
2396 }
2397 
2398 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2399                                        MachineRegisterInfo &MRI,
2400                                        MachineIRBuilder &B) const {
2401   B.setInstr(MI);
2402   Register Dst = MI.getOperand(0).getReg();
2403   LLT DstTy = MRI.getType(Dst);
2404   LLT S16 = LLT::scalar(16);
2405   LLT S32 = LLT::scalar(32);
2406   LLT S64 = LLT::scalar(64);
2407 
2408   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2409     return true;
2410 
2411   if (DstTy == S16)
2412     return legalizeFDIV16(MI, MRI, B);
2413   if (DstTy == S32)
2414     return legalizeFDIV32(MI, MRI, B);
2415   if (DstTy == S64)
2416     return legalizeFDIV64(MI, MRI, B);
2417 
2418   return false;
2419 }
2420 
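// Build an approximation of 2^32 / Src using the 32-bit floating-point
// reciprocal: 0x4f800000 is 2^32 as a float, so the result is
// (uint32_t)(rcp(uitofp(Src)) * 2^32).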
2421 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2422   const LLT S32 = LLT::scalar(32);
2423 
2424   auto Cvt0 = B.buildUITOFP(S32, Src);
2425   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2426   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2427   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2428   return B.buildFPTOUI(S32, Mul).getReg(0);
2429 }
2430 
2431 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2432                                                   Register DstReg,
2433                                                   Register Num,
2434                                                   Register Den,
2435                                                   bool IsRem) const {
2436   const LLT S1 = LLT::scalar(1);
2437   const LLT S32 = LLT::scalar(32);
2438 
2439   // RCP =  URECIP(Den) = 2^32 / Den + e
2440   // e is rounding error.
2441   auto RCP = buildDivRCP(B, Den);
2442 
2443   // RCP_LO = mul(RCP, Den)
2444   auto RCP_LO = B.buildMul(S32, RCP, Den);
2445 
2446   // RCP_HI = mulhu(RCP, Den)
2447   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2448 
2449   // NEG_RCP_LO = -RCP_LO
2450   auto Zero = B.buildConstant(S32, 0);
2451   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2452 
2453   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2454   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2455   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2456 
2457   // Calculate the rounding error from the URECIP instruction
2458   // E = mulhu(ABS_RCP_LO, RCP)
2459   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2460 
2461   // RCP_A_E = RCP + E
2462   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2463 
2464   // RCP_S_E = RCP - E
2465   auto RCP_S_E = B.buildSub(S32, RCP, E);
2466 
2467   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2468   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2469 
2470   // Quotient = mulhu(Tmp0, Num)
2471   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2472 
2473   // Num_S_Remainder = Quotient * Den
2474   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2475 
2476   // Remainder = Num - Num_S_Remainder
2477   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2478 
2479   // Remainder_GE_Den = Remainder >= Den
2480   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2481 
2482   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2483   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2484                                        Num, Num_S_Remainder);
2485 
2486   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2487   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2488 
2489   // Calculate Division result:
2490 
2491   // Quotient_A_One = Quotient + 1
2492   auto One = B.buildConstant(S32, 1);
2493   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2494 
2495   // Quotient_S_One = Quotient - 1
2496   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2497 
2498   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2499   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2500 
2501   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2502   if (IsRem) {
2503     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2504 
2505     // Calculate Rem result:
2506     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2507 
2508     // Remainder_A_Den = Remainder + Den
2509     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2510 
2511     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2512     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2513 
2514     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2515     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2516   } else {
2517     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2518   }
2519 }
2520 
2521 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2522                                               MachineRegisterInfo &MRI,
2523                                               MachineIRBuilder &B) const {
2524   B.setInstr(MI);
2525   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2526   Register DstReg = MI.getOperand(0).getReg();
2527   Register Num = MI.getOperand(1).getReg();
2528   Register Den = MI.getOperand(2).getReg();
2529   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2530   MI.eraseFromParent();
2531   return true;
2532 }
2533 
2534 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2535                                             MachineRegisterInfo &MRI,
2536                                             MachineIRBuilder &B) const {
2537   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2538     return legalizeUDIV_UREM32(MI, MRI, B);
2539   return false;
2540 }
2541 
2542 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2543                                               MachineRegisterInfo &MRI,
2544                                               MachineIRBuilder &B) const {
2545   B.setInstr(MI);
2546   const LLT S32 = LLT::scalar(32);
2547 
2548   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2549   Register DstReg = MI.getOperand(0).getReg();
2550   Register LHS = MI.getOperand(1).getReg();
2551   Register RHS = MI.getOperand(2).getReg();
2552 
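  // Take absolute values using the identity |x| = (x + (x >> 31)) ^ (x >> 31),
  // perform the unsigned division/remainder, then restore the sign: the
  // remainder takes the sign of the dividend and the quotient takes the XOR of
  // the operand signs.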
2553   auto ThirtyOne = B.buildConstant(S32, 31);
2554   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2555   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2556 
2557   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2558   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2559 
2560   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2561   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2562 
2563   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2564   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2565 
2566   if (IsRem) {
2567     auto RSign = LHSign; // Remainder sign is the same as LHS
2568     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2569     B.buildSub(DstReg, UDivRem, RSign);
2570   } else {
2571     auto DSign = B.buildXor(S32, LHSign, RHSign);
2572     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2573     B.buildSub(DstReg, UDivRem, DSign);
2574   }
2575 
2576   MI.eraseFromParent();
2577   return true;
2578 }
2579 
2580 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2581                                             MachineRegisterInfo &MRI,
2582                                             MachineIRBuilder &B) const {
2583   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2584     return legalizeSDIV_SREM32(MI, MRI, B);
2585   return false;
2586 }
2587 
2588 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2589                                                  MachineRegisterInfo &MRI,
2590                                                  MachineIRBuilder &B) const {
2591   Register Res = MI.getOperand(0).getReg();
2592   Register LHS = MI.getOperand(1).getReg();
2593   Register RHS = MI.getOperand(2).getReg();
2594 
2595   uint16_t Flags = MI.getFlags();
2596 
2597   LLT ResTy = MRI.getType(Res);
2598   LLT S32 = LLT::scalar(32);
2599   LLT S64 = LLT::scalar(64);
2600 
2601   const MachineFunction &MF = B.getMF();
2602   bool Unsafe =
2603     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2604 
2605   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2606     return false;
2607 
2608   if (!Unsafe && ResTy == S32 &&
2609       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2610     return false;
2611 
2612   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2613     // 1 / x -> RCP(x)
2614     if (CLHS->isExactlyValue(1.0)) {
2615       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2616         .addUse(RHS)
2617         .setMIFlags(Flags);
2618 
2619       MI.eraseFromParent();
2620       return true;
2621     }
2622 
2623     // -1 / x -> RCP( FNEG(x) )
2624     if (CLHS->isExactlyValue(-1.0)) {
2625       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2626       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2627         .addUse(FNeg.getReg(0))
2628         .setMIFlags(Flags);
2629 
2630       MI.eraseFromParent();
2631       return true;
2632     }
2633   }
2634 
2635   // x / y -> x * (1.0 / y)
2636   if (Unsafe) {
2637     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2638       .addUse(RHS)
2639       .setMIFlags(Flags);
2640     B.buildFMul(Res, LHS, RCP, Flags);
2641 
2642     MI.eraseFromParent();
2643     return true;
2644   }
2645 
2646   return false;
2647 }
2648 
2649 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2650                                          MachineRegisterInfo &MRI,
2651                                          MachineIRBuilder &B) const {
2652   B.setInstr(MI);
2653   Register Res = MI.getOperand(0).getReg();
2654   Register LHS = MI.getOperand(1).getReg();
2655   Register RHS = MI.getOperand(2).getReg();
2656 
2657   uint16_t Flags = MI.getFlags();
2658 
2659   LLT S16 = LLT::scalar(16);
2660   LLT S32 = LLT::scalar(32);
2661 
2662   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2663   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2664 
2665   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2666     .addUse(RHSExt.getReg(0))
2667     .setMIFlags(Flags);
2668 
2669   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2670   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2671 
2672   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2673     .addUse(RDst.getReg(0))
2674     .addUse(RHS)
2675     .addUse(LHS)
2676     .setMIFlags(Flags);
2677 
2678   MI.eraseFromParent();
2679   return true;
2680 }
2681 
2682 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2683 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2684 static void toggleSPDenormMode(bool Enable,
2685                                MachineIRBuilder &B,
2686                                const GCNSubtarget &ST,
2687                                AMDGPU::SIModeRegisterDefaults Mode) {
2688   // Set SP denorm mode to this value.
2689   unsigned SPDenormMode =
2690     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2691 
2692   if (ST.hasDenormModeInst()) {
2693     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2694     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2695 
2696     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2697     B.buildInstr(AMDGPU::S_DENORM_MODE)
2698       .addImm(NewDenormModeValue);
2699 
2700   } else {
2701     // Select FP32 bit field in mode register.
2702     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2703                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2704                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2705 
2706     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2707       .addImm(SPDenormMode)
2708       .addImm(SPDenormModeBitField);
2709   }
2710 }
2711 
2712 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2713                                          MachineRegisterInfo &MRI,
2714                                          MachineIRBuilder &B) const {
2715   B.setInstr(MI);
2716   Register Res = MI.getOperand(0).getReg();
2717   Register LHS = MI.getOperand(1).getReg();
2718   Register RHS = MI.getOperand(2).getReg();
2719   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2720   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2721 
2722   uint16_t Flags = MI.getFlags();
2723 
2724   LLT S32 = LLT::scalar(32);
2725   LLT S1 = LLT::scalar(1);
2726 
2727   auto One = B.buildFConstant(S32, 1.0f);
2728 
2729   auto DenominatorScaled =
2730     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2731       .addUse(RHS)
2732       .addUse(LHS)
2733       .addImm(1)
2734       .setMIFlags(Flags);
2735   auto NumeratorScaled =
2736     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2737       .addUse(LHS)
2738       .addUse(RHS)
2739       .addImm(0)
2740       .setMIFlags(Flags);
2741 
2742   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2743     .addUse(DenominatorScaled.getReg(0))
2744     .setMIFlags(Flags);
2745   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2746 
2747   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2748   // aren't modeled as reading it.
2749   if (!Mode.allFP32Denormals())
2750     toggleSPDenormMode(true, B, ST, Mode);
2751 
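  // Refine the reciprocal estimate with a Newton-Raphson step, form the scaled
  // quotient, and compute the residual that div_fmas uses for the final
  // rounding.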
2752   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2753   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2754   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2755   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2756   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2757   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2758 
2759   if (!Mode.allFP32Denormals())
2760     toggleSPDenormMode(false, B, ST, Mode);
2761 
2762   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2763     .addUse(Fma4.getReg(0))
2764     .addUse(Fma1.getReg(0))
2765     .addUse(Fma3.getReg(0))
2766     .addUse(NumeratorScaled.getReg(1))
2767     .setMIFlags(Flags);
2768 
2769   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2770     .addUse(Fmas.getReg(0))
2771     .addUse(RHS)
2772     .addUse(LHS)
2773     .setMIFlags(Flags);
2774 
2775   MI.eraseFromParent();
2776   return true;
2777 }
2778 
2779 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2780                                          MachineRegisterInfo &MRI,
2781                                          MachineIRBuilder &B) const {
2782   B.setInstr(MI);
2783   Register Res = MI.getOperand(0).getReg();
2784   Register LHS = MI.getOperand(1).getReg();
2785   Register RHS = MI.getOperand(2).getReg();
2786 
2787   uint16_t Flags = MI.getFlags();
2788 
2789   LLT S64 = LLT::scalar(64);
2790   LLT S1 = LLT::scalar(1);
2791 
2792   auto One = B.buildFConstant(S64, 1.0);
2793 
2794   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2795     .addUse(LHS)
2796     .addUse(RHS)
2797     .addImm(1)
2798     .setMIFlags(Flags);
2799 
2800   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2801 
2802   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2803     .addUse(DivScale0.getReg(0))
2804     .setMIFlags(Flags);
2805 
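  // Newton-Raphson refinement of the reciprocal, followed by the quotient
  // estimate and its residual for div_fmas.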
2806   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2807   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2808   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2809 
2810   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2811     .addUse(LHS)
2812     .addUse(RHS)
2813     .addImm(0)
2814     .setMIFlags(Flags);
2815 
2816   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2817   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2818   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2819 
2820   Register Scale;
2821   if (!ST.hasUsableDivScaleConditionOutput()) {
2822     // Workaround a hardware bug on SI where the condition output from div_scale
2823     // is not usable.
2824 
2825     LLT S32 = LLT::scalar(32);
2826 
2827     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2828     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2829     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2830     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2831 
2832     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2833                               Scale1Unmerge.getReg(1));
2834     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2835                               Scale0Unmerge.getReg(1));
2836     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2837   } else {
2838     Scale = DivScale1.getReg(1);
2839   }
2840 
2841   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2842     .addUse(Fma4.getReg(0))
2843     .addUse(Fma3.getReg(0))
2844     .addUse(Mul.getReg(0))
2845     .addUse(Scale)
2846     .setMIFlags(Flags);
2847 
2848   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2849     .addUse(Fmas.getReg(0))
2850     .addUse(RHS)
2851     .addUse(LHS)
2852     .setMIFlags(Flags);
2853 
2854   MI.eraseFromParent();
2855   return true;
2856 }
2857 
2858 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2859                                                  MachineRegisterInfo &MRI,
2860                                                  MachineIRBuilder &B) const {
2861   B.setInstr(MI);
2862   Register Res = MI.getOperand(0).getReg();
2863   Register LHS = MI.getOperand(2).getReg();
2864   Register RHS = MI.getOperand(3).getReg();
2865   uint16_t Flags = MI.getFlags();
2866 
2867   LLT S32 = LLT::scalar(32);
2868   LLT S1 = LLT::scalar(1);
2869 
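  // If |RHS| is large (> 2^96, i.e. 0x6f800000), pre-scale it by 2^-32
  // (0x2f800000) so the reciprocal does not underflow, then apply the same
  // scale factor to the quotient to compensate.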
2870   auto Abs = B.buildFAbs(S32, RHS, Flags);
2871   const APFloat C0Val(1.0f);
2872 
2873   auto C0 = B.buildConstant(S32, 0x6f800000);
2874   auto C1 = B.buildConstant(S32, 0x2f800000);
2875   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2876 
2877   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2878   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2879 
2880   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2881 
2882   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2883     .addUse(Mul0.getReg(0))
2884     .setMIFlags(Flags);
2885 
2886   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2887 
2888   B.buildFMul(Res, Sel, Mul1, Flags);
2889 
2890   MI.eraseFromParent();
2891   return true;
2892 }
2893 
2894 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2895                                                  MachineRegisterInfo &MRI,
2896                                                  MachineIRBuilder &B) const {
2897   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2898   if (!MFI->isEntryFunction()) {
2899     return legalizePreloadedArgIntrin(MI, MRI, B,
2900                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2901   }
2902 
2903   B.setInstr(MI);
2904 
2905   uint64_t Offset =
2906     ST.getTargetLowering()->getImplicitParameterOffset(
2907       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2908   Register DstReg = MI.getOperand(0).getReg();
2909   LLT DstTy = MRI.getType(DstReg);
2910   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2911 
2912   const ArgDescriptor *Arg;
2913   const TargetRegisterClass *RC;
2914   std::tie(Arg, RC)
2915     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2916   if (!Arg)
2917     return false;
2918 
2919   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2920   if (!loadInputValue(KernargPtrReg, B, Arg))
2921     return false;
2922 
2923   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2924   MI.eraseFromParent();
2925   return true;
2926 }
2927 
2928 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2929                                               MachineRegisterInfo &MRI,
2930                                               MachineIRBuilder &B,
2931                                               unsigned AddrSpace) const {
2932   B.setInstr(MI);
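  // is.shared / is.private are lowered by comparing the high 32 bits of the
  // flat pointer against the aperture base of the corresponding segment.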
2933   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2934   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2935   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2936   MI.eraseFromParent();
2937   return true;
2938 }
2939 
2940 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2941 // offset (the offset that is included in bounds checking and swizzling, to be
2942 // split between the instruction's voffset and immoffset fields) and soffset
2943 // (the offset that is excluded from bounds checking and swizzling, to go in
2944 // the instruction's soffset field).  This function takes the first kind of
2945 // offset and figures out how to split it between voffset and immoffset.
2946 std::tuple<Register, unsigned, unsigned>
2947 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2948                                         Register OrigOffset) const {
2949   const unsigned MaxImm = 4095;
2950   Register BaseReg;
2951   unsigned TotalConstOffset;
2952   MachineInstr *OffsetDef;
2953   const LLT S32 = LLT::scalar(32);
2954 
2955   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2956     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2957 
2958   unsigned ImmOffset = TotalConstOffset;
2959 
  // If the immediate value is too big for the immoffset field, put only the
  // low 12 bits into the immoffset field so that the value that is copied or
  // added for the voffset field is a multiple of 4096, and it stands a better
  // chance of being CSE'd with the copy/add for another similar load/store.
  // However, do not round down to a multiple of 4096 if that would be a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
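  // For example (values chosen purely for illustration): a total constant
  // offset of 8200 splits into Overflow = 8192, which is folded into the
  // voffset register below, and ImmOffset = 8; a "negative" total such as -8
  // instead ends up entirely in the register, with ImmOffset = 0.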
2967   unsigned Overflow = ImmOffset & ~MaxImm;
2968   ImmOffset -= Overflow;
2969   if ((int32_t)Overflow < 0) {
2970     Overflow += ImmOffset;
2971     ImmOffset = 0;
2972   }
2973 
2974   if (Overflow != 0) {
2975     if (!BaseReg) {
2976       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2977     } else {
2978       auto OverflowVal = B.buildConstant(S32, Overflow);
2979       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2980     }
2981   }
2982 
2983   if (!BaseReg)
2984     BaseReg = B.buildConstant(S32, 0).getReg(0);
2985 
2986   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2987 }
2988 
2989 /// Handle register layout difference for f16 images for some subtargets.
2990 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2991                                              MachineRegisterInfo &MRI,
2992                                              Register Reg) const {
2993   if (!ST.hasUnpackedD16VMem())
2994     return Reg;
2995 
2996   const LLT S16 = LLT::scalar(16);
2997   const LLT S32 = LLT::scalar(32);
2998   LLT StoreVT = MRI.getType(Reg);
2999   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
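  // On unpacked subtargets each s16 element occupies its own 32-bit register,
  // so e.g. a <4 x s16> store value is rewritten as a <4 x s32> built from the
  // any-extended pieces below.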
3000 
3001   auto Unmerge = B.buildUnmerge(S16, Reg);
3002 
3003   SmallVector<Register, 4> WideRegs;
3004   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3005     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3006 
3007   int NumElts = StoreVT.getNumElements();
3008 
3009   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3010 }
3011 
3012 Register AMDGPULegalizerInfo::fixStoreSourceType(
3013   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3014   MachineRegisterInfo *MRI = B.getMRI();
3015   LLT Ty = MRI->getType(VData);
3016 
3017   const LLT S16 = LLT::scalar(16);
3018 
  // Fix up illegal register types for 8-bit and 16-bit stores.
3020   if (Ty == LLT::scalar(8) || Ty == S16) {
3021     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3022     return AnyExt;
3023   }
3024 
3025   if (Ty.isVector()) {
3026     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3027       if (IsFormat)
3028         return handleD16VData(B, *MRI, VData);
3029     }
3030   }
3031 
3032   return VData;
3033 }
3034 
3035 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3036                                               MachineRegisterInfo &MRI,
3037                                               MachineIRBuilder &B,
3038                                               bool IsTyped,
3039                                               bool IsFormat) const {
3040   B.setInstr(MI);
3041 
3042   Register VData = MI.getOperand(1).getReg();
3043   LLT Ty = MRI.getType(VData);
3044   LLT EltTy = Ty.getScalarType();
3045   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3046   const LLT S32 = LLT::scalar(32);
3047 
3048   VData = fixStoreSourceType(B, VData, IsFormat);
3049   Register RSrc = MI.getOperand(2).getReg();
3050 
3051   MachineMemOperand *MMO = *MI.memoperands_begin();
3052   const int MemSize = MMO->getSize();
3053 
3054   unsigned ImmOffset;
3055   unsigned TotalOffset;
3056 
3057   // The typed intrinsics add an immediate after the registers.
3058   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
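  // A sketch of the incoming operand layout (operand 0 is the intrinsic ID;
  // bracketed operands are only present for some variants):
  //   vdata, rsrc, [vindex,] voffset, soffset, [format,] aux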
3059 
3060   // The struct intrinsic variants add one additional operand over raw.
3061   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3062   Register VIndex;
3063   int OpOffset = 0;
3064   if (HasVIndex) {
3065     VIndex = MI.getOperand(3).getReg();
3066     OpOffset = 1;
3067   }
3068 
3069   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3070   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3071 
3072   unsigned Format = 0;
3073   if (IsTyped) {
3074     Format = MI.getOperand(5 + OpOffset).getImm();
3075     ++OpOffset;
3076   }
3077 
3078   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3079 
3080   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3081   if (TotalOffset != 0)
3082     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3083 
3084   unsigned Opc;
3085   if (IsTyped) {
3086     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3087                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3088   } else if (IsFormat) {
3089     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3090                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3091   } else {
3092     switch (MemSize) {
3093     case 1:
3094       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3095       break;
3096     case 2:
3097       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3098       break;
3099     default:
3100       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3101       break;
3102     }
3103   }
3104 
3105   if (!VIndex)
3106     VIndex = B.buildConstant(S32, 0).getReg(0);
3107 
3108   auto MIB = B.buildInstr(Opc)
3109     .addUse(VData)              // vdata
3110     .addUse(RSrc)               // rsrc
3111     .addUse(VIndex)             // vindex
3112     .addUse(VOffset)            // voffset
3113     .addUse(SOffset)            // soffset
3114     .addImm(ImmOffset);         // offset(imm)
3115 
3116   if (IsTyped)
3117     MIB.addImm(Format);
3118 
3119   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3120      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3121      .addMemOperand(MMO);
3122 
3123   MI.eraseFromParent();
3124   return true;
3125 }
3126 
3127 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3128                                              MachineRegisterInfo &MRI,
3129                                              MachineIRBuilder &B,
3130                                              bool IsFormat,
3131                                              bool IsTyped) const {
3132   B.setInstr(MI);
3133 
3134   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3135   MachineMemOperand *MMO = *MI.memoperands_begin();
3136   const int MemSize = MMO->getSize();
3137   const LLT S32 = LLT::scalar(32);
3138 
3139   Register Dst = MI.getOperand(0).getReg();
3140   Register RSrc = MI.getOperand(2).getReg();
3141 
3142   // The typed intrinsics add an immediate after the registers.
3143   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3144 
3145   // The struct intrinsic variants add one additional operand over raw.
3146   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3147   Register VIndex;
3148   int OpOffset = 0;
3149   if (HasVIndex) {
3150     VIndex = MI.getOperand(3).getReg();
3151     OpOffset = 1;
3152   }
3153 
3154   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3155   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3156 
3157   unsigned Format = 0;
3158   if (IsTyped) {
3159     Format = MI.getOperand(5 + OpOffset).getImm();
3160     ++OpOffset;
3161   }
3162 
3163   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3164   unsigned ImmOffset;
3165   unsigned TotalOffset;
3166 
3167   LLT Ty = MRI.getType(Dst);
3168   LLT EltTy = Ty.getScalarType();
3169   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3170   const bool Unpacked = ST.hasUnpackedD16VMem();
3171 
3172   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3173   if (TotalOffset != 0)
3174     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3175 
3176   unsigned Opc;
3177 
3178   if (IsTyped) {
3179     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3180                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3181   } else if (IsFormat) {
3182     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3183                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3184   } else {
3185     switch (MemSize) {
3186     case 1:
3187       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3188       break;
3189     case 2:
3190       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3191       break;
3192     default:
3193       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3194       break;
3195     }
3196   }
3197 
3198   Register LoadDstReg;
3199 
3200   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3201   LLT UnpackedTy = Ty.changeElementSize(32);
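  // For example, an s8 or s16 load is widened to an s32 result here and
  // truncated back below, while an unpacked d16 vector load such as <2 x s16>
  // produces a <2 x s32> result that is repacked below.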
3202 
3203   if (IsExtLoad)
3204     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3205   else if (Unpacked && IsD16 && Ty.isVector())
3206     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3207   else
3208     LoadDstReg = Dst;
3209 
3210   if (!VIndex)
3211     VIndex = B.buildConstant(S32, 0).getReg(0);
3212 
3213   auto MIB = B.buildInstr(Opc)
3214     .addDef(LoadDstReg)         // vdata
3215     .addUse(RSrc)               // rsrc
3216     .addUse(VIndex)             // vindex
3217     .addUse(VOffset)            // voffset
3218     .addUse(SOffset)            // soffset
3219     .addImm(ImmOffset);         // offset(imm)
3220 
3221   if (IsTyped)
3222     MIB.addImm(Format);
3223 
3224   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3225      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3226      .addMemOperand(MMO);
3227 
3228   if (LoadDstReg != Dst) {
3229     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3230 
    // The result for an extending load was widened to s32; truncate it back
    // down to the original type.
3232     if (IsExtLoad)
3233       B.buildTrunc(Dst, LoadDstReg);
3234     else {
3235       // Repack to original 16-bit vector result
3236       // FIXME: G_TRUNC should work, but legalization currently fails
3237       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3238       SmallVector<Register, 4> Repack;
3239       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3240         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3241       B.buildMerge(Dst, Repack);
3242     }
3243   }
3244 
3245   MI.eraseFromParent();
3246   return true;
3247 }
3248 
3249 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3250                                                MachineIRBuilder &B,
3251                                                bool IsInc) const {
3252   B.setInstr(MI);
3253   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3254                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3255   B.buildInstr(Opc)
3256     .addDef(MI.getOperand(0).getReg())
3257     .addUse(MI.getOperand(2).getReg())
3258     .addUse(MI.getOperand(3).getReg())
3259     .cloneMemRefs(MI);
3260   MI.eraseFromParent();
3261   return true;
3262 }
3263 
3264 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3265   switch (IntrID) {
3266   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3267   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3268     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3269   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3270   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3271     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3272   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3273   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3274     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3275   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3276   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3277     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3278   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3279   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3280     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3281   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3282   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3283     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3284   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3285   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3286     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3287   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3288   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3289     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3290   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3291   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3292     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3293   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3294   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3295     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3296   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3297   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3298     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3299   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3300   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3301     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3302   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3303   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3304     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3305   default:
3306     llvm_unreachable("unhandled atomic opcode");
3307   }
3308 }
3309 
3310 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3311                                                MachineIRBuilder &B,
3312                                                Intrinsic::ID IID) const {
3313   B.setInstr(MI);
3314 
3315   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3316                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3317 
3318   Register Dst = MI.getOperand(0).getReg();
3319   Register VData = MI.getOperand(2).getReg();
3320 
3321   Register CmpVal;
3322   int OpOffset = 0;
3323 
3324   if (IsCmpSwap) {
3325     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3326     ++OpOffset;
3327   }
3328 
3329   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3330   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3331 
3332   // The struct intrinsic variants add one additional operand over raw.
3333   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3334   Register VIndex;
3335   if (HasVIndex) {
3336     VIndex = MI.getOperand(4 + OpOffset).getReg();
3337     ++OpOffset;
3338   }
3339 
3340   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3341   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3342   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3343 
3344   MachineMemOperand *MMO = *MI.memoperands_begin();
3345 
3346   unsigned ImmOffset;
3347   unsigned TotalOffset;
3348   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3349   if (TotalOffset != 0)
3350     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3351 
3352   if (!VIndex)
3353     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3354 
3355   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3356     .addDef(Dst)
3357     .addUse(VData); // vdata
3358 
3359   if (IsCmpSwap)
3360     MIB.addReg(CmpVal);
3361 
3362   MIB.addUse(RSrc)               // rsrc
3363      .addUse(VIndex)             // vindex
3364      .addUse(VOffset)            // voffset
3365      .addUse(SOffset)            // soffset
3366      .addImm(ImmOffset)          // offset(imm)
3367      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3368      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3369      .addMemOperand(MMO);
3370 
3371   MI.eraseFromParent();
3372   return true;
3373 }
3374 
3375 // Produce a vector of s16 elements from s32 pieces.
3376 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3377                              ArrayRef<Register> UnmergeParts) {
3378   const LLT S16 = LLT::scalar(16);
3379 
3380   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3381   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3382     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3383 
3384   B.buildBuildVector(DstReg, RemergeParts);
3385 }
3386 
3387 /// Convert a set of s32 registers to a result vector with s16 elements.
3388 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3389                                ArrayRef<Register> UnmergeParts) {
3390   MachineRegisterInfo &MRI = *B.getMRI();
3391   const LLT V2S16 = LLT::vector(2, 16);
3392   LLT TargetTy = MRI.getType(DstReg);
3393   int NumElts = UnmergeParts.size();
3394 
3395   if (NumElts == 1) {
3396     assert(TargetTy == V2S16);
3397     B.buildBitcast(DstReg, UnmergeParts[0]);
3398     return;
3399   }
3400 
3401   SmallVector<Register, 4> RemergeParts(NumElts);
3402   for (int I = 0; I != NumElts; ++I)
3403     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3404 
3405   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3406     B.buildConcatVectors(DstReg, RemergeParts);
3407     return;
3408   }
3409 
3410   const LLT V3S16 = LLT::vector(3, 16);
3411   const LLT V6S16 = LLT::vector(6, 16);
3412 
3413   // Widen to v6s16 and unpack v3 parts.
3414   assert(TargetTy == V3S16);
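  // For example, two <2 x s16> registers plus an undef <2 x s16> are
  // concatenated into a <6 x s16>, which is then unmerged into two <3 x s16>
  // halves; only the first half feeds the destination.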
3415 
3416   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3417   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3418   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3419 }
3420 
// FIXME: Just a vector trunc should be sufficient, but legalization is
// currently broken.
3423 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3424                                   Register WideDstReg) {
3425   const LLT S32 = LLT::scalar(32);
3426   const LLT S16 = LLT::scalar(16);
3427 
3428   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3429 
3430   int NumOps = Unmerge->getNumOperands() - 1;
3431   SmallVector<Register, 4> RemergeParts(NumOps);
3432   for (int I = 0; I != NumOps; ++I)
3433     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3434 
3435   B.buildBuildVector(DstReg, RemergeParts);
3436 }
3437 
3438 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3439     MachineInstr &MI, MachineIRBuilder &B,
3440     GISelChangeObserver &Observer,
3441     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3442   bool IsTFE = MI.getNumExplicitDefs() == 2;
3443 
  // We only need to process the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or when we need to repack the TFE
  // result.
3446 
3447   // TODO: Need to handle a16 images too
3448   // TODO: Do we need to guard against already legalized intrinsics?
3449   if (!IsTFE && !ST.hasUnpackedD16VMem())
3450     return true;
3451 
3452   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3453     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3454 
3455   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3456     return true;
3457 
3458   B.setInstr(MI);
3459 
3460   MachineRegisterInfo *MRI = B.getMRI();
3461   const LLT S32 = LLT::scalar(32);
3462   const LLT S16 = LLT::scalar(16);
3463 
3464   if (BaseOpcode->Store) { // No TFE for stores?
3465     Register VData = MI.getOperand(1).getReg();
3466     LLT Ty = MRI->getType(VData);
3467     if (!Ty.isVector() || Ty.getElementType() != S16)
3468       return true;
3469 
3470     B.setInstr(MI);
3471 
3472     Observer.changingInstr(MI);
3473     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3474     Observer.changedInstr(MI);
3475     return true;
3476   }
3477 
3478   Register DstReg = MI.getOperand(0).getReg();
3479   LLT Ty = MRI->getType(DstReg);
3480   const LLT EltTy = Ty.getScalarType();
3481   const bool IsD16 = Ty.getScalarType() == S16;
3482   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3483 
3484   if (IsTFE) {
3485     // In the IR, TFE is supposed to be used with a 2 element struct return
    // type. The instruction really returns these two values in one contiguous
3487     // register, with one additional dword beyond the loaded data. Rewrite the
3488     // return type to use a single register result.
3489     Register Dst1Reg = MI.getOperand(1).getReg();
3490     if (MRI->getType(Dst1Reg) != S32)
3491       return false;
3492 
3493     // TODO: Make sure the TFE operand bit is set.
3494 
    // The raw dword-aligned data component of the load. The only legal cases
    // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3498     LLT RoundedTy;
3499     LLT TFETy;
3500 
3501     if (IsD16 && ST.hasUnpackedD16VMem()) {
3502       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3503       TFETy = LLT::vector(NumElts + 1, 32);
3504     } else {
3505       unsigned EltSize = Ty.getScalarSizeInBits();
3506       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3507       unsigned RoundedSize = 32 * RoundedElts;
3508       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3509       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3510     }
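    // For example (purely illustrative): a d16 <3 x s16> TFE load takes the
    // packed path with RoundedTy = <4 x s16> and TFETy = <3 x s32> (two data
    // dwords plus the TFE dword), whereas the unpacked path above would use
    // <3 x s32> data and a <4 x s32> TFE result.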
3511 
3512     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3513     Observer.changingInstr(MI);
3514 
3515     MI.getOperand(0).setReg(TFEReg);
3516     MI.RemoveOperand(1);
3517 
3518     Observer.changedInstr(MI);
3519 
3520     // Insert after the instruction.
3521     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3522 
3523     // Now figure out how to copy the new result register back into the old
3524     // result.
3525 
3526     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3527     int NumDataElts = TFETy.getNumElements() - 1;
3528 
3529     if (!Ty.isVector()) {
3530       // Simplest case is a trivial unmerge (plus a truncate for d16).
3531       UnmergeResults[0] = Ty == S32 ?
3532         DstReg : MRI->createGenericVirtualRegister(S32);
3533 
3534       B.buildUnmerge(UnmergeResults, TFEReg);
3535       if (Ty != S32)
3536         B.buildTrunc(DstReg, UnmergeResults[0]);
3537       return true;
3538     }
3539 
3540     // We have to repack into a new vector of some kind.
3541     for (int I = 0; I != NumDataElts; ++I)
3542       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3543     B.buildUnmerge(UnmergeResults, TFEReg);
3544 
3545     // Drop the final TFE element.
3546     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3547 
3548     if (EltTy == S32)
3549       B.buildBuildVector(DstReg, DataPart);
3550     else if (ST.hasUnpackedD16VMem())
3551       truncToS16Vector(B, DstReg, DataPart);
3552     else
3553       bitcastToS16Vector(B, DstReg, DataPart);
3554 
3555     return true;
3556   }
3557 
3558   // Must be an image load.
3559   if (!Ty.isVector() || Ty.getElementType() != S16)
3560     return true;
3561 
3562   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3563 
3564   LLT WidenedTy = Ty.changeElementType(S32);
3565   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3566 
3567   Observer.changingInstr(MI);
3568   MI.getOperand(0).setReg(WideDstReg);
3569   Observer.changedInstr(MI);
3570 
3571   repackUnpackedD16Load(B, DstReg, WideDstReg);
3572   return true;
3573 }
3574 
3575 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3576   MachineInstr &MI, MachineIRBuilder &B,
3577   GISelChangeObserver &Observer) const {
3578   Register Dst = MI.getOperand(0).getReg();
3579   LLT Ty = B.getMRI()->getType(Dst);
3580   unsigned Size = Ty.getSizeInBits();
3581   MachineFunction &MF = B.getMF();
3582 
3583   Observer.changingInstr(MI);
3584 
3585   // FIXME: We don't really need this intermediate instruction. The intrinsic
3586   // should be fixed to have a memory operand. Since it's readnone, we're not
3587   // allowed to add one.
3588   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3589   MI.RemoveOperand(1); // Remove intrinsic ID
3590 
3591   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3592   // TODO: Should this use datalayout alignment?
3593   const unsigned MemSize = (Size + 7) / 8;
3594   const unsigned MemAlign = 4;
3595   MachineMemOperand *MMO = MF.getMachineMemOperand(
3596     MachinePointerInfo(),
3597     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3598     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3599   MI.addMemOperand(MF, MMO);
3600 
3601   // There are no 96-bit result scalar loads, but widening to 128-bit should
3602   // always be legal. We may need to restore this to a 96-bit result if it turns
3603   // out this needs to be converted to a vector load during RegBankSelect.
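  // For example, an s96 result is widened here to s128, and a <3 x s32> result
  // to <4 x s32>.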
3604   if (!isPowerOf2_32(Size)) {
3605     LegalizerHelper Helper(MF, *this, Observer, B);
3606     B.setInstr(MI);
3607 
3608     if (Ty.isVector())
3609       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3610     else
3611       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3612   }
3613 
3614   Observer.changedInstr(MI);
3615   return true;
3616 }
3617 
3618 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
3619                                                 MachineRegisterInfo &MRI,
3620                                                 MachineIRBuilder &B) const {
3621   B.setInstr(MI);
3622 
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
3624   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3625       !ST.isTrapHandlerEnabled()) {
3626     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
3627   } else {
    // Pass the queue pointer to the trap handler as input, and insert a trap
    // instruction.
3629     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
3630     const ArgDescriptor *Arg =
3631         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
3632     if (!Arg)
3633       return false;
3634     MachineRegisterInfo &MRI = *B.getMRI();
3635     Register SGPR01(AMDGPU::SGPR0_SGPR1);
3636     Register LiveIn = getLiveInRegister(
3637         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
3638         /*InsertLiveInCopy=*/false);
3639     if (!loadInputValue(LiveIn, B, Arg))
3640       return false;
3641     B.buildCopy(SGPR01, LiveIn);
3642     B.buildInstr(AMDGPU::S_TRAP)
3643         .addImm(GCNSubtarget::TrapIDLLVMTrap)
3644         .addReg(SGPR01, RegState::Implicit);
3645   }
3646 
3647   MI.eraseFromParent();
3648   return true;
3649 }
3650 
3651 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
3652     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3653   B.setInstr(MI);
3654 
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
3657   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3658       !ST.isTrapHandlerEnabled()) {
3659     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
3660                                      "debugtrap handler not supported",
3661                                      MI.getDebugLoc(), DS_Warning);
3662     LLVMContext &Ctx = B.getMF().getFunction().getContext();
3663     Ctx.diagnose(NoTrap);
3664   } else {
3665     // Insert debug-trap instruction
3666     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
3667   }
3668 
3669   MI.eraseFromParent();
3670   return true;
3671 }
3672 
3673 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3674                                             MachineIRBuilder &B,
3675                                             GISelChangeObserver &Observer) const {
3676   MachineRegisterInfo &MRI = *B.getMRI();
3677 
  // Replace the control flow intrinsic and its G_BRCOND use with the
  // exec-manipulating branch pseudos.
3679   auto IntrID = MI.getIntrinsicID();
3680   switch (IntrID) {
3681   case Intrinsic::amdgcn_if:
3682   case Intrinsic::amdgcn_else: {
3683     MachineInstr *Br = nullptr;
3684     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3685       const SIRegisterInfo *TRI
3686         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3687 
3688       B.setInstr(*BrCond);
3689       Register Def = MI.getOperand(1).getReg();
3690       Register Use = MI.getOperand(3).getReg();
3691 
3692       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3693       if (Br)
3694         BrTarget = Br->getOperand(0).getMBB();
3695 
3696       if (IntrID == Intrinsic::amdgcn_if) {
3697         B.buildInstr(AMDGPU::SI_IF)
3698           .addDef(Def)
3699           .addUse(Use)
3700           .addMBB(BrTarget);
3701       } else {
3702         B.buildInstr(AMDGPU::SI_ELSE)
3703           .addDef(Def)
3704           .addUse(Use)
3705           .addMBB(BrTarget)
3706           .addImm(0);
3707       }
3708 
3709       if (Br)
3710         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3711 
3712       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3713       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3714       MI.eraseFromParent();
3715       BrCond->eraseFromParent();
3716       return true;
3717     }
3718 
3719     return false;
3720   }
3721   case Intrinsic::amdgcn_loop: {
3722     MachineInstr *Br = nullptr;
3723     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3724       const SIRegisterInfo *TRI
3725         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3726 
3727       B.setInstr(*BrCond);
3728 
3729       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3730       if (Br)
3731         BrTarget = Br->getOperand(0).getMBB();
3732 
3733       Register Reg = MI.getOperand(2).getReg();
3734       B.buildInstr(AMDGPU::SI_LOOP)
3735         .addUse(Reg)
3736         .addMBB(BrTarget);
3737 
3738       if (Br)
3739         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3740 
3741       MI.eraseFromParent();
3742       BrCond->eraseFromParent();
3743       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3744       return true;
3745     }
3746 
3747     return false;
3748   }
3749   case Intrinsic::amdgcn_kernarg_segment_ptr:
3750     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
3751       B.setInstr(MI);
3752       // This only makes sense to call in a kernel, so just lower to null.
3753       B.buildConstant(MI.getOperand(0).getReg(), 0);
3754       MI.eraseFromParent();
3755       return true;
3756     }
3757 
3758     return legalizePreloadedArgIntrin(
3759       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3760   case Intrinsic::amdgcn_implicitarg_ptr:
3761     return legalizeImplicitArgPtr(MI, MRI, B);
3762   case Intrinsic::amdgcn_workitem_id_x:
3763     return legalizePreloadedArgIntrin(MI, MRI, B,
3764                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3765   case Intrinsic::amdgcn_workitem_id_y:
3766     return legalizePreloadedArgIntrin(MI, MRI, B,
3767                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3768   case Intrinsic::amdgcn_workitem_id_z:
3769     return legalizePreloadedArgIntrin(MI, MRI, B,
3770                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3771   case Intrinsic::amdgcn_workgroup_id_x:
3772     return legalizePreloadedArgIntrin(MI, MRI, B,
3773                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3774   case Intrinsic::amdgcn_workgroup_id_y:
3775     return legalizePreloadedArgIntrin(MI, MRI, B,
3776                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3777   case Intrinsic::amdgcn_workgroup_id_z:
3778     return legalizePreloadedArgIntrin(MI, MRI, B,
3779                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3780   case Intrinsic::amdgcn_dispatch_ptr:
3781     return legalizePreloadedArgIntrin(MI, MRI, B,
3782                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3783   case Intrinsic::amdgcn_queue_ptr:
3784     return legalizePreloadedArgIntrin(MI, MRI, B,
3785                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3786   case Intrinsic::amdgcn_implicit_buffer_ptr:
3787     return legalizePreloadedArgIntrin(
3788       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3789   case Intrinsic::amdgcn_dispatch_id:
3790     return legalizePreloadedArgIntrin(MI, MRI, B,
3791                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3792   case Intrinsic::amdgcn_fdiv_fast:
3793     return legalizeFDIVFastIntrin(MI, MRI, B);
3794   case Intrinsic::amdgcn_is_shared:
3795     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3796   case Intrinsic::amdgcn_is_private:
3797     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3798   case Intrinsic::amdgcn_wavefrontsize: {
3799     B.setInstr(MI);
3800     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3801     MI.eraseFromParent();
3802     return true;
3803   }
3804   case Intrinsic::amdgcn_s_buffer_load:
3805     return legalizeSBufferLoad(MI, B, Observer);
3806   case Intrinsic::amdgcn_raw_buffer_store:
3807   case Intrinsic::amdgcn_struct_buffer_store:
3808     return legalizeBufferStore(MI, MRI, B, false, false);
3809   case Intrinsic::amdgcn_raw_buffer_store_format:
3810   case Intrinsic::amdgcn_struct_buffer_store_format:
3811     return legalizeBufferStore(MI, MRI, B, false, true);
3812   case Intrinsic::amdgcn_raw_tbuffer_store:
3813   case Intrinsic::amdgcn_struct_tbuffer_store:
3814     return legalizeBufferStore(MI, MRI, B, true, true);
3815   case Intrinsic::amdgcn_raw_buffer_load:
3816   case Intrinsic::amdgcn_struct_buffer_load:
3817     return legalizeBufferLoad(MI, MRI, B, false, false);
3818   case Intrinsic::amdgcn_raw_buffer_load_format:
3819   case Intrinsic::amdgcn_struct_buffer_load_format:
3820     return legalizeBufferLoad(MI, MRI, B, true, false);
3821   case Intrinsic::amdgcn_raw_tbuffer_load:
3822   case Intrinsic::amdgcn_struct_tbuffer_load:
3823     return legalizeBufferLoad(MI, MRI, B, true, true);
3824   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3825   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3826   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3827   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3828   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3829   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3830   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3831   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3832   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3833   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3834   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3835   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3836   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3837   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3838   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3839   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3840   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3841   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3842   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3843   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3844   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3845   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3846   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3847   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3848   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3849   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3850     return legalizeBufferAtomic(MI, B, IntrID);
3851   case Intrinsic::amdgcn_atomic_inc:
3852     return legalizeAtomicIncDec(MI, B, true);
3853   case Intrinsic::amdgcn_atomic_dec:
3854     return legalizeAtomicIncDec(MI, B, false);
3855   case Intrinsic::trap:
3856     return legalizeTrapIntrinsic(MI, MRI, B);
3857   case Intrinsic::debugtrap:
3858     return legalizeDebugTrapIntrinsic(MI, MRI, B);
3859   default: {
3860     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3861             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3862       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3863     return true;
3864   }
3865   }
3866 
3867   return true;
3868 }
3869