1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
// Round the number of vector elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}
51 
// Round the scalar size in bits to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
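// Reduce the element count so that each resulting piece is at most 64 bits
// wide. For example, a <4 x s32> (128-bit) operand is reduced to <2 x s32>
// pieces.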
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
// Increase the number of vector elements so that the total size in bits is
// the next multiple of 32.
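// For example, <3 x s8> (24 bits) is widened to <4 x s8> (32 bits), and
// <3 x s16> (48 bits) is widened to <4 x s16> (64 bits).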
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT QueryTy = Query.Types[TypeIdx];
176     if (!QueryTy.isVector())
177       return false;
178     const LLT EltTy = QueryTy.getElementType();
179     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
180   };
181 }
182 
183 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
184   return [=](const LegalityQuery &Query) {
185     const LLT Ty = Query.Types[TypeIdx];
186     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
187            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
188   };
189 }
190 
191 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
192   return [=](const LegalityQuery &Query) {
193     return Query.Types[TypeIdx0].getSizeInBits() <
194            Query.Types[TypeIdx1].getSizeInBits();
195   };
196 }
197 
198 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
199   return [=](const LegalityQuery &Query) {
200     return Query.Types[TypeIdx0].getSizeInBits() >
201            Query.Types[TypeIdx1].getSizeInBits();
202   };
203 }
204 
205 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
206                                          const GCNTargetMachine &TM)
  : ST(ST_) {
208   using namespace TargetOpcode;
209 
210   auto GetAddrSpacePtr = [&TM](unsigned AS) {
211     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
212   };
213 
214   const LLT S1 = LLT::scalar(1);
215   const LLT S16 = LLT::scalar(16);
216   const LLT S32 = LLT::scalar(32);
217   const LLT S64 = LLT::scalar(64);
218   const LLT S128 = LLT::scalar(128);
219   const LLT S256 = LLT::scalar(256);
220   const LLT S512 = LLT::scalar(512);
221   const LLT S1024 = LLT::scalar(1024);
222 
223   const LLT V2S16 = LLT::vector(2, 16);
224   const LLT V4S16 = LLT::vector(4, 16);
225 
226   const LLT V2S32 = LLT::vector(2, 32);
227   const LLT V3S32 = LLT::vector(3, 32);
228   const LLT V4S32 = LLT::vector(4, 32);
229   const LLT V5S32 = LLT::vector(5, 32);
230   const LLT V6S32 = LLT::vector(6, 32);
231   const LLT V7S32 = LLT::vector(7, 32);
232   const LLT V8S32 = LLT::vector(8, 32);
233   const LLT V9S32 = LLT::vector(9, 32);
234   const LLT V10S32 = LLT::vector(10, 32);
235   const LLT V11S32 = LLT::vector(11, 32);
236   const LLT V12S32 = LLT::vector(12, 32);
237   const LLT V13S32 = LLT::vector(13, 32);
238   const LLT V14S32 = LLT::vector(14, 32);
239   const LLT V15S32 = LLT::vector(15, 32);
240   const LLT V16S32 = LLT::vector(16, 32);
241   const LLT V32S32 = LLT::vector(32, 32);
242 
243   const LLT V2S64 = LLT::vector(2, 64);
244   const LLT V3S64 = LLT::vector(3, 64);
245   const LLT V4S64 = LLT::vector(4, 64);
246   const LLT V5S64 = LLT::vector(5, 64);
247   const LLT V6S64 = LLT::vector(6, 64);
248   const LLT V7S64 = LLT::vector(7, 64);
249   const LLT V8S64 = LLT::vector(8, 64);
250   const LLT V16S64 = LLT::vector(16, 64);
251 
252   std::initializer_list<LLT> AllS32Vectors =
253     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
254      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
255   std::initializer_list<LLT> AllS64Vectors =
256     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
257 
258   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
259   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
260   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
261   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
262   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
263   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
264   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
265 
266   const LLT CodePtr = FlatPtr;
267 
268   const std::initializer_list<LLT> AddrSpaces64 = {
269     GlobalPtr, ConstantPtr, FlatPtr
270   };
271 
272   const std::initializer_list<LLT> AddrSpaces32 = {
273     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
274   };
275 
276   const std::initializer_list<LLT> FPTypesBase = {
277     S32, S64
278   };
279 
280   const std::initializer_list<LLT> FPTypes16 = {
281     S32, S64, S16
282   };
283 
284   const std::initializer_list<LLT> FPTypesPK16 = {
285     S32, S64, S16, V2S16
286   };
287 
288   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
289 
290   setAction({G_BRCOND, S1}, Legal); // VCC branches
291   setAction({G_BRCOND, S32}, Legal); // SCC branches
292 
293   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
294   // elements for v3s16
295   getActionDefinitionsBuilder(G_PHI)
296     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
297     .legalFor(AllS32Vectors)
298     .legalFor(AllS64Vectors)
299     .legalFor(AddrSpaces64)
300     .legalFor(AddrSpaces32)
301     .clampScalar(0, S32, S256)
302     .widenScalarToNextPow2(0, 32)
303     .clampMaxNumElements(0, S32, 16)
304     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
305     .legalIf(isPointer(0));
306 
307   if (ST.hasVOP3PInsts()) {
308     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
309       .legalFor({S32, S16, V2S16})
310       .clampScalar(0, S16, S32)
311       .clampMaxNumElements(0, S16, 2)
312       .scalarize(0)
313       .widenScalarToNextPow2(0, 32);
314   } else if (ST.has16BitInsts()) {
315     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
316       .legalFor({S32, S16})
317       .clampScalar(0, S16, S32)
318       .scalarize(0)
319       .widenScalarToNextPow2(0, 32);
320   } else {
321     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
322       .legalFor({S32})
323       .clampScalar(0, S32, S32)
324       .scalarize(0);
325   }
326 
327   // FIXME: Not really legal. Placeholder for custom lowering.
328   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
329     .customFor({S32, S64})
330     .clampScalar(0, S32, S64)
331     .widenScalarToNextPow2(0, 32)
332     .scalarize(0);
333 
334   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
335     .legalFor({S32})
336     .clampScalar(0, S32, S32)
337     .scalarize(0);
338 
339   // Report legal for any types we can handle anywhere. For the cases only legal
340   // on the SALU, RegBankSelect will be able to re-legalize.
341   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
342     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
343     .clampScalar(0, S32, S64)
344     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
345     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
346     .widenScalarToNextPow2(0)
347     .scalarize(0);
348 
349   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
350                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
351     .legalFor({{S32, S1}, {S32, S32}})
352     .minScalar(0, S32)
353     // TODO: .scalarize(0)
354     .lower();
355 
356   getActionDefinitionsBuilder(G_BITCAST)
357     // Don't worry about the size constraint.
358     .legalIf(all(isRegisterType(0), isRegisterType(1)))
359     .lower();
360 
361 
362   getActionDefinitionsBuilder(G_CONSTANT)
363     .legalFor({S1, S32, S64, S16, GlobalPtr,
364                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
365     .clampScalar(0, S32, S64)
366     .widenScalarToNextPow2(0)
367     .legalIf(isPointer(0));
368 
369   getActionDefinitionsBuilder(G_FCONSTANT)
370     .legalFor({S32, S64, S16})
371     .clampScalar(0, S16, S64);
372 
373   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
374     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
375                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
376     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
377     .clampScalarOrElt(0, S32, S1024)
378     .legalIf(isMultiple32(0))
379     .widenScalarToNextPow2(0, 32)
380     .clampMaxNumElements(0, S32, 16);
381 
382   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
383   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
384     .unsupportedFor({PrivatePtr})
385     .custom();
386   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
387 
388   auto &FPOpActions = getActionDefinitionsBuilder(
389     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
390     .legalFor({S32, S64});
391   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
392     .customFor({S32, S64});
393   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
394     .customFor({S32, S64});
395 
396   if (ST.has16BitInsts()) {
397     if (ST.hasVOP3PInsts())
398       FPOpActions.legalFor({S16, V2S16});
399     else
400       FPOpActions.legalFor({S16});
401 
402     TrigActions.customFor({S16});
403     FDIVActions.customFor({S16});
404   }
405 
406   auto &MinNumMaxNum = getActionDefinitionsBuilder({
407       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
408 
409   if (ST.hasVOP3PInsts()) {
410     MinNumMaxNum.customFor(FPTypesPK16)
411       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
412       .clampMaxNumElements(0, S16, 2)
413       .clampScalar(0, S16, S64)
414       .scalarize(0);
415   } else if (ST.has16BitInsts()) {
416     MinNumMaxNum.customFor(FPTypes16)
417       .clampScalar(0, S16, S64)
418       .scalarize(0);
419   } else {
420     MinNumMaxNum.customFor(FPTypesBase)
421       .clampScalar(0, S32, S64)
422       .scalarize(0);
423   }
424 
425   if (ST.hasVOP3PInsts())
426     FPOpActions.clampMaxNumElements(0, S16, 2);
427 
428   FPOpActions
429     .scalarize(0)
430     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
431 
432   TrigActions
433     .scalarize(0)
434     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
435 
436   FDIVActions
437     .scalarize(0)
438     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
439 
440   getActionDefinitionsBuilder({G_FNEG, G_FABS})
441     .legalFor(FPTypesPK16)
442     .clampMaxNumElements(0, S16, 2)
443     .scalarize(0)
444     .clampScalar(0, S16, S64);
445 
446   if (ST.has16BitInsts()) {
447     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
448       .legalFor({S32, S64, S16})
449       .scalarize(0)
450       .clampScalar(0, S16, S64);
451   } else {
452     getActionDefinitionsBuilder(G_FSQRT)
453       .legalFor({S32, S64})
454       .scalarize(0)
455       .clampScalar(0, S32, S64);
456 
457     if (ST.hasFractBug()) {
458       getActionDefinitionsBuilder(G_FFLOOR)
459         .customFor({S64})
460         .legalFor({S32, S64})
461         .scalarize(0)
462         .clampScalar(0, S32, S64);
463     } else {
464       getActionDefinitionsBuilder(G_FFLOOR)
465         .legalFor({S32, S64})
466         .scalarize(0)
467         .clampScalar(0, S32, S64);
468     }
469   }
470 
471   getActionDefinitionsBuilder(G_FPTRUNC)
472     .legalFor({{S32, S64}, {S16, S32}})
473     .scalarize(0)
474     .lower();
475 
476   getActionDefinitionsBuilder(G_FPEXT)
477     .legalFor({{S64, S32}, {S32, S16}})
478     .lowerFor({{S64, S16}}) // FIXME: Implement
479     .scalarize(0);
480 
481   getActionDefinitionsBuilder(G_FSUB)
482       // Use actual fsub instruction
483       .legalFor({S32})
484       // Must use fadd + fneg
485       .lowerFor({S64, S16, V2S16})
486       .scalarize(0)
487       .clampScalar(0, S32, S64);
488 
489   // Whether this is legal depends on the floating point mode for the function.
490   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
491   if (ST.hasMadF16())
492     FMad.customFor({S32, S16});
493   else
494     FMad.customFor({S32});
495   FMad.scalarize(0)
496       .lower();
497 
498   // TODO: Do we need to clamp maximum bitwidth?
499   getActionDefinitionsBuilder(G_TRUNC)
500     .legalIf(isScalar(0))
501     .legalFor({{V2S16, V2S32}})
502     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to loop
    // infinitely in the legalizer.
506     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
507     .alwaysLegal();
508 
509   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
510     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
511                {S32, S1}, {S64, S1}, {S16, S1}})
512     .scalarize(0)
513     .clampScalar(0, S32, S64)
514     .widenScalarToNextPow2(1, 32);
515 
516   // TODO: Split s1->s64 during regbankselect for VALU.
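  // Conversions from 64-bit integers to f64 are custom-lowered (see
  // legalizeITOFP), since there is no direct conversion instruction for them.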
517   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
518     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
519     .lowerFor({{S32, S64}})
520     .lowerIf(typeIs(1, S1))
521     .customFor({{S64, S64}});
522   if (ST.has16BitInsts())
523     IToFP.legalFor({{S16, S16}});
524   IToFP.clampScalar(1, S32, S64)
525        .scalarize(0)
526        .widenScalarToNextPow2(1);
527 
528   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
529     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
530     .customFor({{S64, S64}});
531   if (ST.has16BitInsts())
532     FPToI.legalFor({{S16, S16}});
533   else
534     FPToI.minScalar(1, S32);
535 
536   FPToI.minScalar(0, S32)
537        .scalarize(0)
538        .lower();
539 
540   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
541     .scalarize(0)
542     .lower();
543 
544   if (ST.has16BitInsts()) {
545     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
546       .legalFor({S16, S32, S64})
547       .clampScalar(0, S16, S64)
548       .scalarize(0);
549   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
550     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
551       .legalFor({S32, S64})
552       .clampScalar(0, S32, S64)
553       .scalarize(0);
554   } else {
555     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
556       .legalFor({S32})
557       .customFor({S64})
558       .clampScalar(0, S32, S64)
559       .scalarize(0);
560   }
561 
562   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
563     .scalarize(0)
564     .alwaysLegal();
565 
566   auto &CmpBuilder =
567     getActionDefinitionsBuilder(G_ICMP)
568     // The compare output type differs based on the register bank of the output,
569     // so make both s1 and s32 legal.
570     //
571     // Scalar compares producing output in scc will be promoted to s32, as that
572     // is the allocatable register type that will be needed for the copy from
573     // scc. This will be promoted during RegBankSelect, and we assume something
574     // before that won't try to use s32 result types.
575     //
576     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
577     // bank.
578     .legalForCartesianProduct(
579       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
580     .legalForCartesianProduct(
581       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
582   if (ST.has16BitInsts()) {
583     CmpBuilder.legalFor({{S1, S16}});
584   }
585 
586   CmpBuilder
587     .widenScalarToNextPow2(1)
588     .clampScalar(1, S32, S64)
589     .scalarize(0)
590     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
591 
592   getActionDefinitionsBuilder(G_FCMP)
593     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
594     .widenScalarToNextPow2(1)
595     .clampScalar(1, S32, S64)
596     .scalarize(0);
597 
598   // FIXME: fpow has a selection pattern that should move to custom lowering.
599   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
600   if (ST.has16BitInsts())
601     Exp2Ops.legalFor({S32, S16});
602   else
603     Exp2Ops.legalFor({S32});
604   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
605   Exp2Ops.scalarize(0);
606 
607   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
608   if (ST.has16BitInsts())
609     ExpOps.customFor({{S32}, {S16}});
610   else
611     ExpOps.customFor({S32});
612   ExpOps.clampScalar(0, MinScalarFPTy, S32)
613         .scalarize(0);
614 
615   // The 64-bit versions produce 32-bit results, but only on the SALU.
616   getActionDefinitionsBuilder(G_CTPOP)
617     .legalFor({{S32, S32}, {S32, S64}})
618     .clampScalar(0, S32, S32)
619     .clampScalar(1, S32, S64)
620     .scalarize(0)
621     .widenScalarToNextPow2(0, 32)
622     .widenScalarToNextPow2(1, 32);
623 
624   // The hardware instructions return a different result on 0 than the generic
625   // instructions expect. The hardware produces -1, but these produce the
626   // bitwidth.
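  // They are therefore lowered with an explicit zero check around the
  // *_ZERO_UNDEF variants handled below.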
627   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
628     .scalarize(0)
629     .clampScalar(0, S32, S32)
630     .clampScalar(1, S32, S64)
631     .widenScalarToNextPow2(0, 32)
632     .widenScalarToNextPow2(1, 32)
633     .lower();
634 
635   // The 64-bit versions produce 32-bit results, but only on the SALU.
636   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
637     .legalFor({{S32, S32}, {S32, S64}})
638     .clampScalar(0, S32, S32)
639     .clampScalar(1, S32, S64)
640     .scalarize(0)
641     .widenScalarToNextPow2(0, 32)
642     .widenScalarToNextPow2(1, 32);
643 
644   getActionDefinitionsBuilder(G_BITREVERSE)
645     .legalFor({S32})
646     .clampScalar(0, S32, S32)
647     .scalarize(0);
648 
649   if (ST.has16BitInsts()) {
650     getActionDefinitionsBuilder(G_BSWAP)
651       .legalFor({S16, S32, V2S16})
652       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
655       .widenScalarToNextPow2(0)
656       .clampScalar(0, S16, S32)
657       .scalarize(0);
658 
659     if (ST.hasVOP3PInsts()) {
660       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
661         .legalFor({S32, S16, V2S16})
662         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
663         .clampMaxNumElements(0, S16, 2)
664         .minScalar(0, S16)
665         .widenScalarToNextPow2(0)
666         .scalarize(0)
667         .lower();
668     } else {
669       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
670         .legalFor({S32, S16})
671         .widenScalarToNextPow2(0)
672         .minScalar(0, S16)
673         .scalarize(0)
674         .lower();
675     }
676   } else {
677     // TODO: Should have same legality without v_perm_b32
678     getActionDefinitionsBuilder(G_BSWAP)
679       .legalFor({S32})
680       .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
683       .widenScalarToNextPow2(0)
684       .maxScalar(0, S32)
685       .scalarize(0)
686       .lower();
687 
688     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
689       .legalFor({S32})
690       .minScalar(0, S32)
691       .widenScalarToNextPow2(0)
692       .scalarize(0)
693       .lower();
694   }
695 
696   getActionDefinitionsBuilder(G_INTTOPTR)
697     // List the common cases
698     .legalForCartesianProduct(AddrSpaces64, {S64})
699     .legalForCartesianProduct(AddrSpaces32, {S32})
700     .scalarize(0)
701     // Accept any address space as long as the size matches
702     .legalIf(sameSize(0, 1))
703     .widenScalarIf(smallerThan(1, 0),
704       [](const LegalityQuery &Query) {
705         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
706       })
707     .narrowScalarIf(greaterThan(1, 0),
708       [](const LegalityQuery &Query) {
709         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
710       });
711 
712   getActionDefinitionsBuilder(G_PTRTOINT)
713     // List the common cases
714     .legalForCartesianProduct(AddrSpaces64, {S64})
715     .legalForCartesianProduct(AddrSpaces32, {S32})
716     .scalarize(0)
717     // Accept any address space as long as the size matches
718     .legalIf(sameSize(0, 1))
719     .widenScalarIf(smallerThan(0, 1),
720       [](const LegalityQuery &Query) {
721         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
722       })
723     .narrowScalarIf(
724       greaterThan(0, 1),
725       [](const LegalityQuery &Query) {
726         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
727       });
728 
729   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
730     .scalarize(0)
731     .custom();
732 
  // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
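  // Return the widest memory access, in bits, that a single load or store may
  // use for the given address space.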
736   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
737     switch (AS) {
738     // FIXME: Private element size.
739     case AMDGPUAS::PRIVATE_ADDRESS:
740       return 32;
741     // FIXME: Check subtarget
742     case AMDGPUAS::LOCAL_ADDRESS:
743       return ST.useDS128() ? 128 : 64;
744 
745     // Treat constant and global as identical. SMRD loads are sometimes usable
746     // for global loads (ideally constant address space should be eliminated)
747     // depending on the context. Legality cannot be context dependent, but
748     // RegBankSelect can split the load as necessary depending on the pointer
749     // register bank/uniformity and if the memory is invariant or not written in
750     // a kernel.
751     case AMDGPUAS::CONSTANT_ADDRESS:
752     case AMDGPUAS::GLOBAL_ADDRESS:
753       return IsLoad ? 512 : 128;
754     default:
755       return 128;
756     }
757   };
758 
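  // Decide whether a load/store must be split: vector extloads, accesses
  // wider than the address space allows, register counts that are not a
  // power of two (other than 3 where dwordx3 is available), and misaligned
  // accesses the target cannot handle.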
759   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
760                                     bool IsLoad) -> bool {
761     const LLT DstTy = Query.Types[0];
762 
763     // Split vector extloads.
764     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
765     unsigned Align = Query.MMODescrs[0].AlignInBits;
766 
767     if (MemSize < DstTy.getSizeInBits())
768       MemSize = std::max(MemSize, Align);
769 
770     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
771       return true;
772 
773     const LLT PtrTy = Query.Types[1];
774     unsigned AS = PtrTy.getAddressSpace();
775     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
776       return true;
777 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
    // TODO: May be able to widen depending on alignment etc.
780     unsigned NumRegs = (MemSize + 31) / 32;
781     if (NumRegs == 3) {
782       if (!ST.hasDwordx3LoadStores())
783         return true;
784     } else {
785       // If the alignment allows, these should have been widened.
786       if (!isPowerOf2_32(NumRegs))
787         return true;
788     }
789 
790     if (Align < MemSize) {
791       const SITargetLowering *TLI = ST.getTargetLowering();
792       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
793     }
794 
795     return false;
796   };
797 
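  // A non-power-of-2 sized load may be widened to the next power of two if
  // the access is aligned enough to read the extra bytes, e.g. a 96-bit load
  // with 128-bit alignment when dwordx3 loads are not available.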
798   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
799     unsigned Size = Query.Types[0].getSizeInBits();
800     if (isPowerOf2_32(Size))
801       return false;
802 
803     if (Size == 96 && ST.hasDwordx3LoadStores())
804       return false;
805 
806     unsigned AddrSpace = Query.Types[1].getAddressSpace();
807     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
808       return false;
809 
810     unsigned Align = Query.MMODescrs[0].AlignInBits;
811     unsigned RoundedSize = NextPowerOf2(Size);
812     return (Align >= RoundedSize);
813   };
814 
815   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
816   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
817   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
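  // When unaligned buffer access is supported, an alignment of 0 is used in
  // the memory descriptors below, i.e. no alignment requirement is imposed.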
818 
819   // TODO: Refine based on subtargets which support unaligned access or 128-bit
820   // LDS
821   // TODO: Unsupported flat for SI.
822 
823   for (unsigned Op : {G_LOAD, G_STORE}) {
824     const bool IsStore = Op == G_STORE;
825 
826     auto &Actions = getActionDefinitionsBuilder(Op);
827     // Whitelist the common cases.
828     // TODO: Loads to s16 on gfx9
829     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
830                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
831                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
832                                       {S128, GlobalPtr, 128, GlobalAlign32},
833                                       {S64, GlobalPtr, 64, GlobalAlign32},
834                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
835                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
836                                       {S32, GlobalPtr, 8, GlobalAlign8},
837                                       {S32, GlobalPtr, 16, GlobalAlign16},
838 
839                                       {S32, LocalPtr, 32, 32},
840                                       {S64, LocalPtr, 64, 32},
841                                       {V2S32, LocalPtr, 64, 32},
842                                       {S32, LocalPtr, 8, 8},
843                                       {S32, LocalPtr, 16, 16},
844                                       {V2S16, LocalPtr, 32, 32},
845 
846                                       {S32, PrivatePtr, 32, 32},
847                                       {S32, PrivatePtr, 8, 8},
848                                       {S32, PrivatePtr, 16, 16},
849                                       {V2S16, PrivatePtr, 32, 32},
850 
851                                       {S32, FlatPtr, 32, GlobalAlign32},
852                                       {S32, FlatPtr, 16, GlobalAlign16},
853                                       {S32, FlatPtr, 8, GlobalAlign8},
854                                       {V2S16, FlatPtr, 32, GlobalAlign32},
855 
856                                       {S32, ConstantPtr, 32, GlobalAlign32},
857                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
858                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
859                                       {S64, ConstantPtr, 64, GlobalAlign32},
860                                       {S128, ConstantPtr, 128, GlobalAlign32},
861                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
862     Actions
863         .customIf(typeIs(1, Constant32Ptr))
864         // Widen suitably aligned loads by loading extra elements.
865         .moreElementsIf([=](const LegalityQuery &Query) {
866             const LLT Ty = Query.Types[0];
867             return Op == G_LOAD && Ty.isVector() &&
868                    shouldWidenLoadResult(Query);
869           }, moreElementsToNextPow2(0))
870         .widenScalarIf([=](const LegalityQuery &Query) {
871             const LLT Ty = Query.Types[0];
872             return Op == G_LOAD && !Ty.isVector() &&
873                    shouldWidenLoadResult(Query);
874           }, widenScalarOrEltToNextPow2(0))
875         .narrowScalarIf(
876             [=](const LegalityQuery &Query) -> bool {
877               return !Query.Types[0].isVector() &&
878                      needToSplitMemOp(Query, Op == G_LOAD);
879             },
880             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
881               const LLT DstTy = Query.Types[0];
882               const LLT PtrTy = Query.Types[1];
883 
884               const unsigned DstSize = DstTy.getSizeInBits();
885               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
886 
887               // Split extloads.
888               if (DstSize > MemSize)
889                 return std::make_pair(0, LLT::scalar(MemSize));
890 
891               if (!isPowerOf2_32(DstSize)) {
892                 // We're probably decomposing an odd sized store. Try to split
893                 // to the widest type. TODO: Account for alignment. As-is it
894                 // should be OK, since the new parts will be further legalized.
895                 unsigned FloorSize = PowerOf2Floor(DstSize);
896                 return std::make_pair(0, LLT::scalar(FloorSize));
897               }
898 
899               if (DstSize > 32 && (DstSize % 32 != 0)) {
900                 // FIXME: Need a way to specify non-extload of larger size if
901                 // suitably aligned.
902                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
903               }
904 
905               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
906                                                      Op == G_LOAD);
907               if (MemSize > MaxSize)
908                 return std::make_pair(0, LLT::scalar(MaxSize));
909 
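              // The remaining reason to split is insufficient alignment, so
              // break the access into naturally aligned pieces.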
910               unsigned Align = Query.MMODescrs[0].AlignInBits;
911               return std::make_pair(0, LLT::scalar(Align));
912             })
913         .fewerElementsIf(
914             [=](const LegalityQuery &Query) -> bool {
915               return Query.Types[0].isVector() &&
916                      needToSplitMemOp(Query, Op == G_LOAD);
917             },
918             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
919               const LLT DstTy = Query.Types[0];
920               const LLT PtrTy = Query.Types[1];
921 
922               LLT EltTy = DstTy.getElementType();
923               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
924                                                      Op == G_LOAD);
925 
926               // FIXME: Handle widened to power of 2 results better. This ends
927               // up scalarizing.
928               // FIXME: 3 element stores scalarized on SI
929 
930               // Split if it's too large for the address space.
931               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
932                 unsigned NumElts = DstTy.getNumElements();
933                 unsigned EltSize = EltTy.getSizeInBits();
934 
935                 if (MaxSize % EltSize == 0) {
936                   return std::make_pair(
937                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
938                 }
939 
940                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
941 
942                 // FIXME: Refine when odd breakdowns handled
943                 // The scalars will need to be re-legalized.
944                 if (NumPieces == 1 || NumPieces >= NumElts ||
945                     NumElts % NumPieces != 0)
946                   return std::make_pair(0, EltTy);
947 
948                 return std::make_pair(0,
949                                       LLT::vector(NumElts / NumPieces, EltTy));
950               }
951 
952               // FIXME: We could probably handle weird extending loads better.
953               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
954               if (DstTy.getSizeInBits() > MemSize)
955                 return std::make_pair(0, EltTy);
956 
957               unsigned EltSize = EltTy.getSizeInBits();
958               unsigned DstSize = DstTy.getSizeInBits();
959               if (!isPowerOf2_32(DstSize)) {
960                 // We're probably decomposing an odd sized store. Try to split
961                 // to the widest type. TODO: Account for alignment. As-is it
962                 // should be OK, since the new parts will be further legalized.
963                 unsigned FloorSize = PowerOf2Floor(DstSize);
964                 return std::make_pair(
965                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
966               }
967 
968               // Need to split because of alignment.
969               unsigned Align = Query.MMODescrs[0].AlignInBits;
970               if (EltSize > Align &&
971                   (EltSize / Align < DstTy.getNumElements())) {
972                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
973               }
974 
975               // May need relegalization for the scalars.
976               return std::make_pair(0, EltTy);
977             })
978         .minScalar(0, S32);
979 
980     if (IsStore)
981       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
982 
983     // TODO: Need a bitcast lower option?
984     Actions
985         .legalIf([=](const LegalityQuery &Query) {
986           const LLT Ty0 = Query.Types[0];
987           unsigned Size = Ty0.getSizeInBits();
988           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
989           unsigned Align = Query.MMODescrs[0].AlignInBits;
990 
991           // FIXME: Widening store from alignment not valid.
992           if (MemSize < Size)
993             MemSize = std::max(MemSize, Align);
994 
995           // No extending vector loads.
996           if (Size > MemSize && Ty0.isVector())
997             return false;
998 
999           switch (MemSize) {
1000           case 8:
1001           case 16:
1002             return Size == 32;
1003           case 32:
1004           case 64:
1005           case 128:
1006             return true;
1007           case 96:
1008             return ST.hasDwordx3LoadStores();
1009           case 256:
1010           case 512:
1011             return true;
1012           default:
1013             return false;
1014           }
1015         })
1016         .widenScalarToNextPow2(0)
1017         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1018   }
1019 
1020   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1021                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1022                                                   {S32, GlobalPtr, 16, 2 * 8},
1023                                                   {S32, LocalPtr, 8, 8},
1024                                                   {S32, LocalPtr, 16, 16},
1025                                                   {S32, PrivatePtr, 8, 8},
1026                                                   {S32, PrivatePtr, 16, 16},
1027                                                   {S32, ConstantPtr, 8, 8},
1028                                                   {S32, ConstantPtr, 16, 2 * 8}});
1029   if (ST.hasFlatAddressSpace()) {
1030     ExtLoads.legalForTypesWithMemDesc(
1031         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1032   }
1033 
1034   ExtLoads.clampScalar(0, S32, S32)
1035           .widenScalarToNextPow2(0)
1036           .unsupportedIfMemSizeNotPow2()
1037           .lower();
1038 
1039   auto &Atomics = getActionDefinitionsBuilder(
1040     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1041      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1042      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1043      G_ATOMICRMW_UMIN})
1044     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1045                {S64, GlobalPtr}, {S64, LocalPtr}});
1046   if (ST.hasFlatAddressSpace()) {
1047     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1048   }
1049 
1050   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1051     .legalFor({{S32, LocalPtr}});
1052 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
1055   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1056     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1057                 {S32, FlatPtr}, {S64, FlatPtr}})
1058     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1059                {S32, RegionPtr}, {S64, RegionPtr}});
1060   // TODO: Pointer types, any 32-bit or 64-bit vector
1061 
1062   // Condition should be s32 for scalar, s1 for vector.
1063   getActionDefinitionsBuilder(G_SELECT)
1064     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1065           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1066           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1067     .clampScalar(0, S16, S64)
1068     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1069     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1070     .scalarize(1)
1071     .clampMaxNumElements(0, S32, 2)
1072     .clampMaxNumElements(0, LocalPtr, 2)
1073     .clampMaxNumElements(0, PrivatePtr, 2)
1074     .scalarize(0)
1075     .widenScalarToNextPow2(0)
1076     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1077 
1078   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1079   // be more flexible with the shift amount type.
1080   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1081     .legalFor({{S32, S32}, {S64, S32}});
1082   if (ST.has16BitInsts()) {
1083     if (ST.hasVOP3PInsts()) {
1084       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1085             .clampMaxNumElements(0, S16, 2);
1086     } else
1087       Shifts.legalFor({{S16, S32}, {S16, S16}});
1088 
1089     // TODO: Support 16-bit shift amounts
1090     Shifts.clampScalar(1, S32, S32);
1091     Shifts.clampScalar(0, S16, S64);
1092     Shifts.widenScalarToNextPow2(0, 16);
1093   } else {
1094     // Make sure we legalize the shift amount type first, as the general
1095     // expansion for the shifted type will produce much worse code if it hasn't
1096     // been truncated already.
1097     Shifts.clampScalar(1, S32, S32);
1098     Shifts.clampScalar(0, S32, S64);
1099     Shifts.widenScalarToNextPow2(0, 32);
1100   }
1101   Shifts.scalarize(0);
1102 
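  // The custom lowering folds constant indices into static G_EXTRACT /
  // G_INSERT operations; dynamic indices are left for selection to handle
  // with register indexing.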
1103   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1104     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1105     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1106     unsigned IdxTypeIdx = 2;
1107 
1108     getActionDefinitionsBuilder(Op)
1109       .customIf([=](const LegalityQuery &Query) {
1110           const LLT EltTy = Query.Types[EltTypeIdx];
1111           const LLT VecTy = Query.Types[VecTypeIdx];
1112           const LLT IdxTy = Query.Types[IdxTypeIdx];
1113           return (EltTy.getSizeInBits() == 16 ||
1114                   EltTy.getSizeInBits() % 32 == 0) &&
1115                  VecTy.getSizeInBits() % 32 == 0 &&
1116                  VecTy.getSizeInBits() <= 1024 &&
1117                  IdxTy.getSizeInBits() == 32;
1118         })
1119       .clampScalar(EltTypeIdx, S32, S64)
1120       .clampScalar(VecTypeIdx, S32, S64)
1121       .clampScalar(IdxTypeIdx, S32, S32);
1122   }
1123 
1124   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1125     .unsupportedIf([=](const LegalityQuery &Query) {
1126         const LLT &EltTy = Query.Types[1].getElementType();
1127         return Query.Types[0] != EltTy;
1128       });
1129 
1130   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1131     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1132     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1133 
1134     // FIXME: Doesn't handle extract of illegal sizes.
1135     getActionDefinitionsBuilder(Op)
1136       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1137       // FIXME: Multiples of 16 should not be legal.
1138       .legalIf([=](const LegalityQuery &Query) {
1139           const LLT BigTy = Query.Types[BigTyIdx];
1140           const LLT LitTy = Query.Types[LitTyIdx];
1141           return (BigTy.getSizeInBits() % 32 == 0) &&
1142                  (LitTy.getSizeInBits() % 16 == 0);
1143         })
1144       .widenScalarIf(
1145         [=](const LegalityQuery &Query) {
1146           const LLT BigTy = Query.Types[BigTyIdx];
1147           return (BigTy.getScalarSizeInBits() < 16);
1148         },
1149         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1150       .widenScalarIf(
1151         [=](const LegalityQuery &Query) {
1152           const LLT LitTy = Query.Types[LitTyIdx];
1153           return (LitTy.getScalarSizeInBits() < 16);
1154         },
1155         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1156       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1157       .widenScalarToNextPow2(BigTyIdx, 32);
1158 
1159   }
1160 
1161   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1162     .legalForCartesianProduct(AllS32Vectors, {S32})
1163     .legalForCartesianProduct(AllS64Vectors, {S64})
1164     .clampNumElements(0, V16S32, V32S32)
1165     .clampNumElements(0, V2S64, V16S64)
1166     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1167 
1168   if (ST.hasScalarPackInsts()) {
1169     BuildVector
1170       // FIXME: Should probably widen s1 vectors straight to s32
1171       .minScalarOrElt(0, S16)
1172       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1173       .minScalar(1, S32);
1174 
1175     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1176       .legalFor({V2S16, S32})
1177       .lower();
1178     BuildVector.minScalarOrElt(0, S32);
1179   } else {
1180     BuildVector.customFor({V2S16, S16});
1181     BuildVector.minScalarOrElt(0, S32);
1182 
1183     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1184       .customFor({V2S16, S32})
1185       .lower();
1186   }
1187 
1188   BuildVector.legalIf(isRegisterType(0));
1189 
1190   // FIXME: Clamp maximum size
1191   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1192     .legalIf(isRegisterType(0));
1193 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1196   if (ST.hasVOP3PInsts()) {
1197     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1198       .customFor({V2S16, V2S16})
1199       .lower();
1200   } else
1201     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1202 
1203   // Merge/Unmerge
1204   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1205     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1206     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1207 
1208     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1209       const LLT Ty = Query.Types[TypeIdx];
1210       if (Ty.isVector()) {
1211         const LLT &EltTy = Ty.getElementType();
1212         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1213           return true;
1214         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1215           return true;
1216       }
1217       return false;
1218     };
1219 
1220     auto &Builder = getActionDefinitionsBuilder(Op)
1221       // Try to widen to s16 first for small types.
1222       // TODO: Only do this on targets with legal s16 shifts
1223       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1224 
1225       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1226       .lowerFor({{S16, V2S16}})
1227       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1228       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1229                            elementTypeIs(1, S16)),
1230                        changeTo(1, V2S16))
1231       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1232       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1233       // valid.
1234       .clampScalar(LitTyIdx, S32, S512)
1235       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1236       // Break up vectors with weird elements into scalars
1237       .fewerElementsIf(
1238         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1239         scalarize(0))
1240       .fewerElementsIf(
1241         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1242         scalarize(1))
1243       .clampScalar(BigTyIdx, S32, S1024);
1244 
1245     if (Op == G_MERGE_VALUES) {
1246       Builder.widenScalarIf(
1247         // TODO: Use 16-bit shifts if legal for 8-bit values?
1248         [=](const LegalityQuery &Query) {
1249           const LLT Ty = Query.Types[LitTyIdx];
1250           return Ty.getSizeInBits() < 32;
1251         },
1252         changeTo(LitTyIdx, S32));
1253     }
1254 
1255     Builder.widenScalarIf(
1256       [=](const LegalityQuery &Query) {
1257         const LLT Ty = Query.Types[BigTyIdx];
1258         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1259           Ty.getSizeInBits() % 16 != 0;
1260       },
1261       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
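        // For example, s90 is widened to s128, while s300 is widened to s320
        // (a multiple of 64) rather than s512.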
1264         const LLT &Ty = Query.Types[BigTyIdx];
1265         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1266         if (NewSizeInBits >= 256) {
1267           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1268           if (RoundedTo < NewSizeInBits)
1269             NewSizeInBits = RoundedTo;
1270         }
1271         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1272       })
1273       .legalIf([=](const LegalityQuery &Query) {
1274           const LLT &BigTy = Query.Types[BigTyIdx];
1275           const LLT &LitTy = Query.Types[LitTyIdx];
1276 
1277           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1278             return false;
1279           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1280             return false;
1281 
1282           return BigTy.getSizeInBits() % 16 == 0 &&
1283                  LitTy.getSizeInBits() % 16 == 0 &&
1284                  BigTy.getSizeInBits() <= 1024;
1285         })
1286       // Any vectors left are the wrong size. Scalarize them.
1287       .scalarize(0)
1288       .scalarize(1);
1289   }
1290 
1291   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1292   // RegBankSelect.
1293   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1294     .legalFor({{S32}, {S64}});
1295 
1296   if (ST.hasVOP3PInsts()) {
1297     SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, so
      // that the expansion can take advantage of the packed 16-bit vector
      // shifts.
1301       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1302   } else if (ST.has16BitInsts()) {
1303     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1304   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1307     SextInReg.lowerFor({{S32}, {S64}});
1308   }
1309 
1310   SextInReg
1311     .scalarize(0)
1312     .clampScalar(0, S32, S64)
1313     .lower();
1314 
1315   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1316     .legalFor({S64});
1317 
1318   getActionDefinitionsBuilder({
1319       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1320       G_FCOPYSIGN,
1321 
1322       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1323       G_READ_REGISTER,
1324       G_WRITE_REGISTER,
1325 
1326       G_SADDO, G_SSUBO,
1327 
1328        // TODO: Implement
1329       G_FMINIMUM, G_FMAXIMUM
1330     }).lower();
1331 
1332   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1333         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1334         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1335     .unsupported();
1336 
1337   computeTables();
1338   verify(*ST.getInstrInfo());
1339 }
1340 
1341 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1342                                          MachineRegisterInfo &MRI,
1343                                          MachineIRBuilder &B,
1344                                          GISelChangeObserver &Observer) const {
1345   switch (MI.getOpcode()) {
1346   case TargetOpcode::G_ADDRSPACE_CAST:
1347     return legalizeAddrSpaceCast(MI, MRI, B);
1348   case TargetOpcode::G_FRINT:
1349     return legalizeFrint(MI, MRI, B);
1350   case TargetOpcode::G_FCEIL:
1351     return legalizeFceil(MI, MRI, B);
1352   case TargetOpcode::G_INTRINSIC_TRUNC:
1353     return legalizeIntrinsicTrunc(MI, MRI, B);
1354   case TargetOpcode::G_SITOFP:
1355     return legalizeITOFP(MI, MRI, B, true);
1356   case TargetOpcode::G_UITOFP:
1357     return legalizeITOFP(MI, MRI, B, false);
1358   case TargetOpcode::G_FPTOSI:
1359     return legalizeFPTOI(MI, MRI, B, true);
1360   case TargetOpcode::G_FPTOUI:
1361     return legalizeFPTOI(MI, MRI, B, false);
1362   case TargetOpcode::G_FMINNUM:
1363   case TargetOpcode::G_FMAXNUM:
1364   case TargetOpcode::G_FMINNUM_IEEE:
1365   case TargetOpcode::G_FMAXNUM_IEEE:
1366     return legalizeMinNumMaxNum(MI, MRI, B);
1367   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1368     return legalizeExtractVectorElt(MI, MRI, B);
1369   case TargetOpcode::G_INSERT_VECTOR_ELT:
1370     return legalizeInsertVectorElt(MI, MRI, B);
1371   case TargetOpcode::G_SHUFFLE_VECTOR:
1372     return legalizeShuffleVector(MI, MRI, B);
1373   case TargetOpcode::G_FSIN:
1374   case TargetOpcode::G_FCOS:
1375     return legalizeSinCos(MI, MRI, B);
1376   case TargetOpcode::G_GLOBAL_VALUE:
1377     return legalizeGlobalValue(MI, MRI, B);
1378   case TargetOpcode::G_LOAD:
1379     return legalizeLoad(MI, MRI, B, Observer);
1380   case TargetOpcode::G_FMAD:
1381     return legalizeFMad(MI, MRI, B);
1382   case TargetOpcode::G_FDIV:
1383     return legalizeFDIV(MI, MRI, B);
1384   case TargetOpcode::G_UDIV:
1385   case TargetOpcode::G_UREM:
1386     return legalizeUDIV_UREM(MI, MRI, B);
1387   case TargetOpcode::G_SDIV:
1388   case TargetOpcode::G_SREM:
1389     return legalizeSDIV_SREM(MI, MRI, B);
1390   case TargetOpcode::G_ATOMIC_CMPXCHG:
1391     return legalizeAtomicCmpXChg(MI, MRI, B);
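  // G_FLOG and G_FLOG10 are lowered in terms of G_FLOG2:
  //   log(x)   = log2(x) * ln(2)        (ln(2) == 1 / log2(e))
  //   log10(x) = log2(x) * log10(2)     (log10(2) == ln(2) / ln(10))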
1392   case TargetOpcode::G_FLOG:
1393     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1394   case TargetOpcode::G_FLOG10:
1395     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1396   case TargetOpcode::G_FEXP:
1397     return legalizeFExp(MI, B);
1398   case TargetOpcode::G_FPOW:
1399     return legalizeFPow(MI, B);
1400   case TargetOpcode::G_FFLOOR:
1401     return legalizeFFloor(MI, MRI, B);
1402   case TargetOpcode::G_BUILD_VECTOR:
1403     return legalizeBuildVector(MI, MRI, B);
1404   default:
1405     return false;
1406   }
1407 
1408   llvm_unreachable("expected switch to return");
1409 }
1410 
1411 Register AMDGPULegalizerInfo::getSegmentAperture(
1412   unsigned AS,
1413   MachineRegisterInfo &MRI,
1414   MachineIRBuilder &B) const {
1415   MachineFunction &MF = B.getMF();
1416   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1417   const LLT S32 = LLT::scalar(32);
1418 
1419   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1420 
1421   if (ST.hasApertureRegs()) {
1422     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1423     // getreg.
1424     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1425         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1426         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1427     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1428         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1429         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
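    // Pack the hwreg id, bit offset and width-1 into the immediate so
    // s_getreg_b32 reads just the aperture base field of the register.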
1430     unsigned Encoding =
1431         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1432         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1433         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1434 
1435     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1436 
1437     B.buildInstr(AMDGPU::S_GETREG_B32)
1438       .addDef(GetReg)
1439       .addImm(Encoding);
1440     MRI.setType(GetReg, S32);
1441 
1442     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1443     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1444   }
1445 
1446   Register QueuePtr = MRI.createGenericVirtualRegister(
1447     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1448 
1449   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1450   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1451     return Register();
1452 
1453   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1454   // private_segment_aperture_base_hi.
1455   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1456 
1457   // TODO: can we be smarter about machine pointer info?
1458   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1459   MachineMemOperand *MMO = MF.getMachineMemOperand(
1460     PtrInfo,
1461     MachineMemOperand::MOLoad |
1462     MachineMemOperand::MODereferenceable |
1463     MachineMemOperand::MOInvariant,
1464     4,
1465     MinAlign(64, StructOffset));
1466 
1467   Register LoadAddr;
1468 
1469   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1470   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1471 }
1472 
1473 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1474   MachineInstr &MI, MachineRegisterInfo &MRI,
1475   MachineIRBuilder &B) const {
1476   MachineFunction &MF = B.getMF();
1477 
1478   B.setInstr(MI);
1479 
1480   const LLT S32 = LLT::scalar(32);
1481   Register Dst = MI.getOperand(0).getReg();
1482   Register Src = MI.getOperand(1).getReg();
1483 
1484   LLT DstTy = MRI.getType(Dst);
1485   LLT SrcTy = MRI.getType(Src);
1486   unsigned DestAS = DstTy.getAddressSpace();
1487   unsigned SrcAS = SrcTy.getAddressSpace();
1488 
1489   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1490   // vector element.
1491   assert(!DstTy.isVector());
1492 
1493   const AMDGPUTargetMachine &TM
1494     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1495 
1496   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1497   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1498     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1499     return true;
1500   }
1501 
1502   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1503     // Truncate.
1504     B.buildExtract(Dst, Src, 0);
1505     MI.eraseFromParent();
1506     return true;
1507   }
1508 
1509   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1510     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1511     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1512 
1513     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1514     // another. Merge operands are required to be the same type, but creating an
1515     // extra ptrtoint would be kind of pointless.
1516     auto HighAddr = B.buildConstant(
1517       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1518     B.buildMerge(Dst, {Src, HighAddr});
1519     MI.eraseFromParent();
1520     return true;
1521   }
1522 
1523   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1524     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1525            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1526     unsigned NullVal = TM.getNullPointerValue(DestAS);
1527 
1528     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1529     auto FlatNull = B.buildConstant(SrcTy, 0);
1530 
1531     // Extract low 32-bits of the pointer.
1532     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1533 
1534     auto CmpRes =
1535         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1536     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1537 
1538     MI.eraseFromParent();
1539     return true;
1540   }
1541 
1542   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1543     return false;
1544 
1545   if (!ST.hasFlatAddressSpace())
1546     return false;
1547 
1548   auto SegmentNull =
1549       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1550   auto FlatNull =
1551       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1552 
1553   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1554   if (!ApertureReg.isValid())
1555     return false;
1556 
1557   auto CmpRes =
1558       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1559 
1560   // Coerce the type of the low half of the result so we can use merge_values.
1561   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1562 
1563   // TODO: Should we allow mismatched types but matching sizes in merges to
1564   // avoid the ptrtoint?
1565   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1566   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1567 
1568   MI.eraseFromParent();
1569   return true;
1570 }
1571 
1572 bool AMDGPULegalizerInfo::legalizeFrint(
1573   MachineInstr &MI, MachineRegisterInfo &MRI,
1574   MachineIRBuilder &B) const {
1575   B.setInstr(MI);
1576 
1577   Register Src = MI.getOperand(1).getReg();
1578   LLT Ty = MRI.getType(Src);
1579   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1580 
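  // C1 is 2^52: adding and then subtracting a copy-signed 2^52 forces the
  // value to round to an integer. C2 is the largest double below 2^52, so
  // inputs with |src| > C2 are already integral and the final select returns
  // the original source unchanged.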
1581   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1582   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1583 
1584   auto C1 = B.buildFConstant(Ty, C1Val);
1585   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1586 
1587   // TODO: Should this propagate fast-math-flags?
1588   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1589   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1590 
1591   auto C2 = B.buildFConstant(Ty, C2Val);
1592   auto Fabs = B.buildFAbs(Ty, Src);
1593 
1594   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1595   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1596   return true;
1597 }
1598 
1599 bool AMDGPULegalizerInfo::legalizeFceil(
1600   MachineInstr &MI, MachineRegisterInfo &MRI,
1601   MachineIRBuilder &B) const {
1602   B.setInstr(MI);
1603 
1604   const LLT S1 = LLT::scalar(1);
1605   const LLT S64 = LLT::scalar(64);
1606 
1607   Register Src = MI.getOperand(1).getReg();
1608   assert(MRI.getType(Src) == S64);
1609 
1610   // result = trunc(src)
1611   // if (src > 0.0 && src != result)
1612   //   result += 1.0
1613 
1614   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1615 
1616   const auto Zero = B.buildFConstant(S64, 0.0);
1617   const auto One = B.buildFConstant(S64, 1.0);
1618   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1619   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1620   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1621   auto Add = B.buildSelect(S64, And, One, Zero);
1622 
1623   // TODO: Should this propagate fast-math-flags?
1624   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1625   return true;
1626 }
1627 
1628 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1629                                               MachineIRBuilder &B) {
1630   const unsigned FractBits = 52;
1631   const unsigned ExpBits = 11;
1632   LLT S32 = LLT::scalar(32);
1633 
1634   auto Const0 = B.buildConstant(S32, FractBits - 32);
1635   auto Const1 = B.buildConstant(S32, ExpBits);
1636 
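  // ubfe extracts the 11-bit exponent field starting at bit 20 of the high
  // dword; subtracting 1023 removes the IEEE-754 double exponent bias.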
1637   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
1638     .addUse(Const0.getReg(0))
1639     .addUse(Const1.getReg(0));
1640 
1641   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1642 }
1643 
1644 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1645   MachineInstr &MI, MachineRegisterInfo &MRI,
1646   MachineIRBuilder &B) const {
1647   B.setInstr(MI);
1648 
1649   const LLT S1 = LLT::scalar(1);
1650   const LLT S32 = LLT::scalar(32);
1651   const LLT S64 = LLT::scalar(64);
1652 
1653   Register Src = MI.getOperand(1).getReg();
1654   assert(MRI.getType(Src) == S64);
1655 
1656   // TODO: Should this use extract since the low half is unused?
1657   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1658   Register Hi = Unmerge.getReg(1);
1659 
1660   // Extract the upper half, since this is where we will find the sign and
1661   // exponent.
1662   auto Exp = extractF64Exponent(Hi, B);
1663 
1664   const unsigned FractBits = 52;
1665 
1666   // Extract the sign bit.
1667   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1668   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1669 
1670   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1671 
1672   const auto Zero32 = B.buildConstant(S32, 0);
1673 
1674   // Extend back to 64-bits.
1675   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1676 
1677   auto Shr = B.buildAShr(S64, FractMask, Exp);
1678   auto Not = B.buildNot(S64, Shr);
1679   auto Tmp0 = B.buildAnd(S64, Src, Not);
1680   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1681 
1682   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1683   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1684 
1685   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1686   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1687   return true;
1688 }
1689 
1690 bool AMDGPULegalizerInfo::legalizeITOFP(
1691   MachineInstr &MI, MachineRegisterInfo &MRI,
1692   MachineIRBuilder &B, bool Signed) const {
1693   B.setInstr(MI);
1694 
1695   Register Dst = MI.getOperand(0).getReg();
1696   Register Src = MI.getOperand(1).getReg();
1697 
1698   const LLT S64 = LLT::scalar(64);
1699   const LLT S32 = LLT::scalar(32);
1700 
1701   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1702 
1703   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1704 
1705   auto CvtHi = Signed ?
1706     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1707     B.buildUITOFP(S64, Unmerge.getReg(1));
1708 
1709   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1710 
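  // Scale the converted high half by 2^32 with ldexp and add in the unsigned
  // conversion of the low half: result = (double)Hi * 2^32 + (double)Lo.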
1711   auto ThirtyTwo = B.buildConstant(S32, 32);
1712   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1713     .addUse(CvtHi.getReg(0))
1714     .addUse(ThirtyTwo.getReg(0));
1715 
1716   // TODO: Should this propagate fast-math-flags?
1717   B.buildFAdd(Dst, LdExp, CvtLo);
1718   MI.eraseFromParent();
1719   return true;
1720 }
1721 
1722 // TODO: Copied from DAG implementation. Verify logic and document how this
1723 // actually works.
1724 bool AMDGPULegalizerInfo::legalizeFPTOI(
1725   MachineInstr &MI, MachineRegisterInfo &MRI,
1726   MachineIRBuilder &B, bool Signed) const {
1727   B.setInstr(MI);
1728 
1729   Register Dst = MI.getOperand(0).getReg();
1730   Register Src = MI.getOperand(1).getReg();
1731 
1732   const LLT S64 = LLT::scalar(64);
1733   const LLT S32 = LLT::scalar(32);
1734 
1735   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1736 
1737   unsigned Flags = MI.getFlags();
1738 
1739   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
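  // K0 is 2^-32 and K1 is -2^32: Hi = floor(trunc(x) * 2^-32) and
  // Lo = fma(Hi, -2^32, trunc(x)) split the value into two 32-bit halves.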
1740   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1741   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1742 
1743   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1744   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1745   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1746 
1747   auto Hi = Signed ?
1748     B.buildFPTOSI(S32, FloorMul) :
1749     B.buildFPTOUI(S32, FloorMul);
1750   auto Lo = B.buildFPTOUI(S32, Fma);
1751 
1752   B.buildMerge(Dst, { Lo, Hi });
1753   MI.eraseFromParent();
1754 
1755   return true;
1756 }
1757 
1758 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1759   MachineInstr &MI, MachineRegisterInfo &MRI,
1760   MachineIRBuilder &B) const {
1761   MachineFunction &MF = B.getMF();
1762   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1763 
1764   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1765                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1766 
1767   // With ieee_mode disabled, the instructions have the correct behavior
1768   // already for G_FMINNUM/G_FMAXNUM
1769   if (!MFI->getMode().IEEE)
1770     return !IsIEEEOp;
1771 
1772   if (IsIEEEOp)
1773     return true;
1774 
1775   MachineIRBuilder HelperBuilder(MI);
1776   GISelObserverWrapper DummyObserver;
1777   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1778   HelperBuilder.setInstr(MI);
1779   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1780 }
1781 
1782 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1783   MachineInstr &MI, MachineRegisterInfo &MRI,
1784   MachineIRBuilder &B) const {
1785   // TODO: Should move some of this into LegalizerHelper.
1786 
1787   // TODO: Promote dynamic indexing of s16 to s32
1788 
1789   // FIXME: Artifact combiner probably should have replaced the truncated
1790   // constant before this, so we shouldn't need
1791   // getConstantVRegValWithLookThrough.
1792   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1793     MI.getOperand(2).getReg(), MRI);
1794   if (!IdxVal) // Dynamic case will be selected to register indexing.
1795     return true;
1796 
1797   Register Dst = MI.getOperand(0).getReg();
1798   Register Vec = MI.getOperand(1).getReg();
1799 
1800   LLT VecTy = MRI.getType(Vec);
1801   LLT EltTy = VecTy.getElementType();
1802   assert(EltTy == MRI.getType(Dst));
1803 
1804   B.setInstr(MI);
1805 
1806   if (IdxVal->Value < VecTy.getNumElements())
1807     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1808   else
1809     B.buildUndef(Dst);
1810 
1811   MI.eraseFromParent();
1812   return true;
1813 }
1814 
1815 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1816   MachineInstr &MI, MachineRegisterInfo &MRI,
1817   MachineIRBuilder &B) const {
1818   // TODO: Should move some of this into LegalizerHelper.
1819 
1820   // TODO: Promote dynamic indexing of s16 to s32
1821 
1822   // FIXME: Artifact combiner probably should have replaced the truncated
1823   // constant before this, so we shouldn't need
1824   // getConstantVRegValWithLookThrough.
1825   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1826     MI.getOperand(3).getReg(), MRI);
1827   if (!IdxVal) // Dynamic case will be selected to register indexing.
1828     return true;
1829 
1830   Register Dst = MI.getOperand(0).getReg();
1831   Register Vec = MI.getOperand(1).getReg();
1832   Register Ins = MI.getOperand(2).getReg();
1833 
1834   LLT VecTy = MRI.getType(Vec);
1835   LLT EltTy = VecTy.getElementType();
1836   assert(EltTy == MRI.getType(Ins));
1837 
1838   B.setInstr(MI);
1839 
1840   if (IdxVal->Value < VecTy.getNumElements())
1841     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1842   else
1843     B.buildUndef(Dst);
1844 
1845   MI.eraseFromParent();
1846   return true;
1847 }
1848 
1849 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1850   MachineInstr &MI, MachineRegisterInfo &MRI,
1851   MachineIRBuilder &B) const {
1852   const LLT V2S16 = LLT::vector(2, 16);
1853 
1854   Register Dst = MI.getOperand(0).getReg();
1855   Register Src0 = MI.getOperand(1).getReg();
1856   LLT DstTy = MRI.getType(Dst);
1857   LLT SrcTy = MRI.getType(Src0);
1858 
1859   if (SrcTy == V2S16 && DstTy == V2S16 &&
1860       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1861     return true;
1862 
1863   MachineIRBuilder HelperBuilder(MI);
1864   GISelObserverWrapper DummyObserver;
1865   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1866   HelperBuilder.setInstr(MI);
1867   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1868 }
1869 
1870 bool AMDGPULegalizerInfo::legalizeSinCos(
1871   MachineInstr &MI, MachineRegisterInfo &MRI,
1872   MachineIRBuilder &B) const {
1873   B.setInstr(MI);
1874 
1875   Register DstReg = MI.getOperand(0).getReg();
1876   Register SrcReg = MI.getOperand(1).getReg();
1877   LLT Ty = MRI.getType(DstReg);
1878   unsigned Flags = MI.getFlags();
1879 
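  // The amdgcn.sin/cos intrinsics take an input already scaled by 1/(2*pi),
  // so multiply first; subtargets with a reduced valid trig input range also
  // wrap the scaled value into [0, 1) with amdgcn.fract.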
1880   Register TrigVal;
1881   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1882   if (ST.hasTrigReducedRange()) {
1883     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1884     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1885       .addUse(MulVal.getReg(0))
1886       .setMIFlags(Flags).getReg(0);
1887   } else
1888     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1889 
1890   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1891     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1892   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1893     .addUse(TrigVal)
1894     .setMIFlags(Flags);
1895   MI.eraseFromParent();
1896   return true;
1897 }
1898 
1899 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1900   Register DstReg, LLT PtrTy,
1901   MachineIRBuilder &B, const GlobalValue *GV,
1902   unsigned Offset, unsigned GAFlags) const {
1903   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1904   // to the following code sequence:
1905   //
1906   // For constant address space:
1907   //   s_getpc_b64 s[0:1]
1908   //   s_add_u32 s0, s0, $symbol
1909   //   s_addc_u32 s1, s1, 0
1910   //
1911   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1912   //   a fixup or relocation is emitted to replace $symbol with a literal
1913   //   constant, which is a pc-relative offset from the encoding of the $symbol
1914   //   operand to the global variable.
1915   //
1916   // For global address space:
1917   //   s_getpc_b64 s[0:1]
1918   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1919   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1920   //
1921   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1922   //   fixups or relocations are emitted to replace $symbol@*@lo and
1923   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1924   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1925   //   operand to the global variable.
1926   //
1927   // What we want here is an offset from the value returned by s_getpc
1928   // (which is the address of the s_add_u32 instruction) to the global
1929   // variable, but since the encoding of $symbol starts 4 bytes after the start
1930   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1931   // small. This requires us to add 4 to the global variable offset in order to
1932   // compute the correct address.
1933 
1934   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1935 
1936   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1937     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1938 
1939   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1940     .addDef(PCReg);
1941 
1942   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1943   if (GAFlags == SIInstrInfo::MO_NONE)
1944     MIB.addImm(0);
1945   else
1946     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1947 
1948   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1949 
1950   if (PtrTy.getSizeInBits() == 32)
1951     B.buildExtract(DstReg, PCReg, 0);
1952   return true;
1953 }
1954 
1955 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1956   MachineInstr &MI, MachineRegisterInfo &MRI,
1957   MachineIRBuilder &B) const {
1958   Register DstReg = MI.getOperand(0).getReg();
1959   LLT Ty = MRI.getType(DstReg);
1960   unsigned AS = Ty.getAddressSpace();
1961 
1962   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1963   MachineFunction &MF = B.getMF();
1964   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1965   B.setInstr(MI);
1966 
1967   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1968     if (!MFI->isEntryFunction()) {
1969       const Function &Fn = MF.getFunction();
1970       DiagnosticInfoUnsupported BadLDSDecl(
1971         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1972         DS_Warning);
1973       Fn.getContext().diagnose(BadLDSDecl);
1974 
1975       // We currently don't have a way to correctly allocate LDS objects that
1976       // aren't directly associated with a kernel. We do force inlining of
1977       // functions that use local objects. However, if these dead functions are
1978       // not eliminated, we don't want a compile time error. Just emit a warning
1979       // and a trap, since there should be no callable path here.
1980       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1981       B.buildUndef(DstReg);
1982       MI.eraseFromParent();
1983       return true;
1984     }
1985 
1986     // TODO: We could emit code to handle the initialization somewhere.
1987     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1988       const SITargetLowering *TLI = ST.getTargetLowering();
1989       if (!TLI->shouldUseLDSConstAddress(GV)) {
1990         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1991         return true; // Leave in place;
1992       }
1993 
1994       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1995       MI.eraseFromParent();
1996       return true;
1997     }
1998 
1999     const Function &Fn = MF.getFunction();
2000     DiagnosticInfoUnsupported BadInit(
2001       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2002     Fn.getContext().diagnose(BadInit);
2003     return true;
2004   }
2005 
2006   const SITargetLowering *TLI = ST.getTargetLowering();
2007 
2008   if (TLI->shouldEmitFixup(GV)) {
2009     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2010     MI.eraseFromParent();
2011     return true;
2012   }
2013 
2014   if (TLI->shouldEmitPCReloc(GV)) {
2015     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2016     MI.eraseFromParent();
2017     return true;
2018   }
2019 
2020   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2021   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2022 
2023   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2024     MachinePointerInfo::getGOT(MF),
2025     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2026     MachineMemOperand::MOInvariant,
2027     8 /*Size*/, 8 /*Align*/);
2028 
2029   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2030 
2031   if (Ty.getSizeInBits() == 32) {
2032     // Truncate if this is a 32-bit constant address.
2033     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2034     B.buildExtract(DstReg, Load, 0);
2035   } else
2036     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2037 
2038   MI.eraseFromParent();
2039   return true;
2040 }
2041 
2042 bool AMDGPULegalizerInfo::legalizeLoad(
2043   MachineInstr &MI, MachineRegisterInfo &MRI,
2044   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2045   B.setInstr(MI);
2046   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2047   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2048   Observer.changingInstr(MI);
2049   MI.getOperand(1).setReg(Cast.getReg(0));
2050   Observer.changedInstr(MI);
2051   return true;
2052 }
2053 
2054 bool AMDGPULegalizerInfo::legalizeFMad(
2055   MachineInstr &MI, MachineRegisterInfo &MRI,
2056   MachineIRBuilder &B) const {
2057   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2058   assert(Ty.isScalar());
2059 
2060   MachineFunction &MF = B.getMF();
2061   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2062 
2063   // TODO: Always legal with future ftz flag.
2064   // FIXME: Do we need just output?
2065   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2066     return true;
2067   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2068     return true;
2069 
2070   MachineIRBuilder HelperBuilder(MI);
2071   GISelObserverWrapper DummyObserver;
2072   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2073   HelperBuilder.setMBB(*MI.getParent());
2074   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2075 }
2076 
2077 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2078   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2079   Register DstReg = MI.getOperand(0).getReg();
2080   Register PtrReg = MI.getOperand(1).getReg();
2081   Register CmpVal = MI.getOperand(2).getReg();
2082   Register NewVal = MI.getOperand(3).getReg();
2083 
2084   assert(SITargetLowering::isFlatGlobalAddrSpace(
2085            MRI.getType(PtrReg).getAddressSpace()) &&
2086          "this should not have been custom lowered");
2087 
2088   LLT ValTy = MRI.getType(CmpVal);
2089   LLT VecTy = LLT::vector(2, ValTy);
2090 
2091   B.setInstr(MI);
2092   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2093 
2094   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2095     .addDef(DstReg)
2096     .addUse(PtrReg)
2097     .addUse(PackedVal)
2098     .setMemRefs(MI.memoperands());
2099 
2100   MI.eraseFromParent();
2101   return true;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeFlog(
2105   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2106   Register Dst = MI.getOperand(0).getReg();
2107   Register Src = MI.getOperand(1).getReg();
2108   LLT Ty = B.getMRI()->getType(Dst);
2109   unsigned Flags = MI.getFlags();
2110   B.setInstr(MI);
2111 
2112   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2113   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2114 
2115   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2116   MI.eraseFromParent();
2117   return true;
2118 }
2119 
2120 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2121                                        MachineIRBuilder &B) const {
2122   Register Dst = MI.getOperand(0).getReg();
2123   Register Src = MI.getOperand(1).getReg();
2124   unsigned Flags = MI.getFlags();
2125   LLT Ty = B.getMRI()->getType(Dst);
2126   B.setInstr(MI);
2127 
2128   auto K = B.buildFConstant(Ty, numbers::log2e);
2129   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2130   B.buildFExp2(Dst, Mul, Flags);
2131   MI.eraseFromParent();
2132   return true;
2133 }
2134 
2135 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2136                                        MachineIRBuilder &B) const {
2137   Register Dst = MI.getOperand(0).getReg();
2138   Register Src0 = MI.getOperand(1).getReg();
2139   Register Src1 = MI.getOperand(2).getReg();
2140   unsigned Flags = MI.getFlags();
2141   LLT Ty = B.getMRI()->getType(Dst);
2142   B.setInstr(MI);
2143   const LLT S16 = LLT::scalar(16);
2144   const LLT S32 = LLT::scalar(32);
2145 
2146   if (Ty == S32) {
2147     auto Log = B.buildFLog2(S32, Src0, Flags);
2148     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2149       .addUse(Log.getReg(0))
2150       .addUse(Src1)
2151       .setMIFlags(Flags);
2152     B.buildFExp2(Dst, Mul, Flags);
2153   } else if (Ty == S16) {
2154     // There's no f16 fmul_legacy, so we need to convert for it.
2155     auto Log = B.buildFLog2(S16, Src0, Flags);
2156     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2157     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2158     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2159       .addUse(Ext0.getReg(0))
2160       .addUse(Ext1.getReg(0))
2161       .setMIFlags(Flags);
2162 
2163     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2164   } else
2165     return false;
2166 
2167   MI.eraseFromParent();
2168   return true;
2169 }
2170 
2171 // Find a source register, ignoring any possible source modifiers.
2172 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2173   Register ModSrc = OrigSrc;
2174   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2175     ModSrc = SrcFNeg->getOperand(1).getReg();
2176     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2177       ModSrc = SrcFAbs->getOperand(1).getReg();
2178   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2179     ModSrc = SrcFAbs->getOperand(1).getReg();
2180   return ModSrc;
2181 }
2182 
2183 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2184                                          MachineRegisterInfo &MRI,
2185                                          MachineIRBuilder &B) const {
2186   B.setInstr(MI);
2187 
2188   const LLT S1 = LLT::scalar(1);
2189   const LLT S64 = LLT::scalar(64);
2190   Register Dst = MI.getOperand(0).getReg();
2191   Register OrigSrc = MI.getOperand(1).getReg();
2192   unsigned Flags = MI.getFlags();
2193   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2194          "this should not have been custom lowered");
2195 
2196   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2197   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2198   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2199   // V_FRACT bug is:
2200   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2201   //
2202   // Convert floor(x) to (x - fract(x))
2203 
2204   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2205     .addUse(OrigSrc)
2206     .setMIFlags(Flags);
2207 
2208   // Give source modifier matching some assistance before obscuring a foldable
2209   // pattern.
2210 
2211   // TODO: We can avoid the neg on the fract? The input sign to fract
2212   // shouldn't matter?
2213   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2214 
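  // 0x3fefffffffffffff is the largest double strictly less than 1.0, the
  // clamp value from the V_FRACT workaround described above.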
2215   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2216 
2217   Register Min = MRI.createGenericVirtualRegister(S64);
2218 
2219   // We don't need to concern ourselves with the snan handling difference, so
2220   // use the one which will directly select.
2221   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2222   if (MFI->getMode().IEEE)
2223     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2224   else
2225     B.buildFMinNum(Min, Fract, Const, Flags);
2226 
2227   Register CorrectedFract = Min;
2228   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2229     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2230     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2231   }
2232 
2233   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2234   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2235 
2236   MI.eraseFromParent();
2237   return true;
2238 }
2239 
2240 // Turn an illegal packed v2s16 build vector into bit operations.
2241 // TODO: This should probably be a bitcast action in LegalizerHelper.
2242 bool AMDGPULegalizerInfo::legalizeBuildVector(
2243   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2244   Register Dst = MI.getOperand(0).getReg();
2245   LLT DstTy = MRI.getType(Dst);
2246   const LLT S32 = LLT::scalar(32);
2247   const LLT V2S16 = LLT::vector(2, 16);
2248   (void)DstTy;
2249   (void)V2S16;
2250   assert(DstTy == V2S16);
2251 
2252   Register Src0 = MI.getOperand(1).getReg();
2253   Register Src1 = MI.getOperand(2).getReg();
2254   assert(MRI.getType(Src0) == LLT::scalar(16));
2255 
2256   B.setInstr(MI);
2257   auto Merge = B.buildMerge(S32, {Src0, Src1});
2258   B.buildBitcast(Dst, Merge);
2259 
2260   MI.eraseFromParent();
2261   return true;
2262 }
2263 
2264 // Return the use branch instruction, otherwise null if the usage is invalid.
2265 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2266                                        MachineRegisterInfo &MRI,
2267                                        MachineInstr *&Br) {
2268   Register CondDef = MI.getOperand(0).getReg();
2269   if (!MRI.hasOneNonDBGUse(CondDef))
2270     return nullptr;
2271 
2272   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2273   if (UseMI.getParent() != MI.getParent() ||
2274       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2275     return nullptr;
2276 
2277   // Make sure the cond br is followed by a G_BR
2278   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2279   if (Next != MI.getParent()->end()) {
2280     if (Next->getOpcode() != AMDGPU::G_BR)
2281       return nullptr;
2282     Br = &*Next;
2283   }
2284 
2285   return &UseMI;
2286 }
2287 
2288 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2289                                                MachineRegisterInfo &MRI,
2290                                                Register LiveIn,
2291                                                Register PhyReg) const {
2292   assert(PhyReg.isPhysical() && "Physical register expected");
2293 
2294   // Insert the live-in copy, if required, by defining the destination
2295   // virtual register.
2296   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2297   if (!MRI.getVRegDef(LiveIn)) {
2298     // FIXME: Should have scoped insert pt
2299     MachineBasicBlock &OrigInsBB = B.getMBB();
2300     auto OrigInsPt = B.getInsertPt();
2301 
2302     MachineBasicBlock &EntryMBB = B.getMF().front();
2303     EntryMBB.addLiveIn(PhyReg);
2304     B.setInsertPt(EntryMBB, EntryMBB.begin());
2305     B.buildCopy(LiveIn, PhyReg);
2306 
2307     B.setInsertPt(OrigInsBB, OrigInsPt);
2308   }
2309 
2310   return LiveIn;
2311 }
2312 
2313 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2314                                                 MachineRegisterInfo &MRI,
2315                                                 Register PhyReg, LLT Ty,
2316                                                 bool InsertLiveInCopy) const {
2317   assert(PhyReg.isPhysical() && "Physical register expected");
2318 
2319   // Get or create the virtual live-in register
2320   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2321   if (!LiveIn) {
2322     LiveIn = MRI.createGenericVirtualRegister(Ty);
2323     MRI.addLiveIn(PhyReg, LiveIn);
2324   }
2325 
2326   // When the actual copy required is from a virtual register to a physical
2327   // register (to be inserted later), the live-in copy from the physical
2328   // register to a virtual register is not required here.
2329   if (!InsertLiveInCopy)
2330     return LiveIn;
2331 
2332   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2333 }
2334 
2335 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2336     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2337   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2338   const ArgDescriptor *Arg;
2339   const TargetRegisterClass *RC;
2340   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2341   if (!Arg) {
2342     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2343     return nullptr;
2344   }
2345   return Arg;
2346 }
2347 
2348 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2349                                          const ArgDescriptor *Arg) const {
2350   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2351     return false; // TODO: Handle these
2352 
2353   Register SrcReg = Arg->getRegister();
2354   assert(SrcReg.isPhysical() && "Physical register expected");
2355   assert(DstReg.isVirtual() && "Virtual register expected");
2356 
2357   MachineRegisterInfo &MRI = *B.getMRI();
2358 
2359   LLT Ty = MRI.getType(DstReg);
2360   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2361 
2362   if (Arg->isMasked()) {
2363     // TODO: Should we try to emit this once in the entry block?
2364     const LLT S32 = LLT::scalar(32);
2365     const unsigned Mask = Arg->getMask();
2366     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
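    // The value is packed with other arguments in this register; shift it
    // down and mask away the neighboring fields.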
2367 
2368     Register AndMaskSrc = LiveIn;
2369 
2370     if (Shift != 0) {
2371       auto ShiftAmt = B.buildConstant(S32, Shift);
2372       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2373     }
2374 
2375     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2376   } else {
2377     B.buildCopy(DstReg, LiveIn);
2378   }
2379 
2380   return true;
2381 }
2382 
2383 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2384     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2385     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2386   B.setInstr(MI);
2387 
2388   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2389   if (!Arg)
2390     return false;
2391 
2392   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2393     return false;
2394 
2395   MI.eraseFromParent();
2396   return true;
2397 }
2398 
2399 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2400                                        MachineRegisterInfo &MRI,
2401                                        MachineIRBuilder &B) const {
2402   B.setInstr(MI);
2403   Register Dst = MI.getOperand(0).getReg();
2404   LLT DstTy = MRI.getType(Dst);
2405   LLT S16 = LLT::scalar(16);
2406   LLT S32 = LLT::scalar(32);
2407   LLT S64 = LLT::scalar(64);
2408 
2409   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2410     return true;
2411 
2412   if (DstTy == S16)
2413     return legalizeFDIV16(MI, MRI, B);
2414   if (DstTy == S32)
2415     return legalizeFDIV32(MI, MRI, B);
2416   if (DstTy == S64)
2417     return legalizeFDIV64(MI, MRI, B);
2418 
2419   return false;
2420 }
2421 
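// Build an approximation of 2^32 / Src: convert to float, take the hardware
// reciprocal, scale by 2^32 (0x4f800000) and convert back to an unsigned
// integer.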
2422 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2423   const LLT S32 = LLT::scalar(32);
2424 
2425   auto Cvt0 = B.buildUITOFP(S32, Src);
2426   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2427   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2428   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2429   return B.buildFPTOUI(S32, Mul).getReg(0);
2430 }
2431 
2432 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2433                                                   Register DstReg,
2434                                                   Register Num,
2435                                                   Register Den,
2436                                                   bool IsRem) const {
2437   const LLT S1 = LLT::scalar(1);
2438   const LLT S32 = LLT::scalar(32);
2439 
2440   // RCP =  URECIP(Den) = 2^32 / Den + e
2441   // e is rounding error.
2442   auto RCP = buildDivRCP(B, Den);
2443 
2444   // RCP_LO = mul(RCP, Den)
2445   auto RCP_LO = B.buildMul(S32, RCP, Den);
2446 
2447   // RCP_HI = mulhu(RCP, Den)
2448   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2449 
2450   // NEG_RCP_LO = -RCP_LO
2451   auto Zero = B.buildConstant(S32, 0);
2452   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2453 
2454   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2455   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2456   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2457 
2458   // Calculate the rounding error from the URECIP instruction
2459   // E = mulhu(ABS_RCP_LO, RCP)
2460   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2461 
2462   // RCP_A_E = RCP + E
2463   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2464 
2465   // RCP_S_E = RCP - E
2466   auto RCP_S_E = B.buildSub(S32, RCP, E);
2467 
2468   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2469   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2470 
2471   // Quotient = mulhu(Tmp0, Num)
2472   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2473 
2474   // Num_S_Remainder = Quotient * Den
2475   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2476 
2477   // Remainder = Num - Num_S_Remainder
2478   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2479 
2480   // Remainder_GE_Den = Remainder >= Den
2481   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2482 
2483   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2484   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2485                                        Num, Num_S_Remainder);
2486 
2487   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2488   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2489 
2490   // Calculate Division result:
2491 
2492   // Quotient_A_One = Quotient + 1
2493   auto One = B.buildConstant(S32, 1);
2494   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2495 
2496   // Quotient_S_One = Quotient - 1
2497   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2498 
2499   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2500   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2501 
2502   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2503   if (IsRem) {
2504     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2505 
2506     // Calculate Rem result:
2507     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2508 
2509     // Remainder_A_Den = Remainder + Den
2510     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2511 
2512     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2513     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2514 
2515     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2516     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2517   } else {
2518     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2519   }
2520 }
2521 
2522 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2523                                               MachineRegisterInfo &MRI,
2524                                               MachineIRBuilder &B) const {
2525   B.setInstr(MI);
2526   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2527   Register DstReg = MI.getOperand(0).getReg();
2528   Register Num = MI.getOperand(1).getReg();
2529   Register Den = MI.getOperand(2).getReg();
2530   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2531   MI.eraseFromParent();
2532   return true;
2533 }
2534 
2535 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2536                                             MachineRegisterInfo &MRI,
2537                                             MachineIRBuilder &B) const {
2538   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2539     return legalizeUDIV_UREM32(MI, MRI, B);
2540   return false;
2541 }
2542 
2543 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2544                                               MachineRegisterInfo &MRI,
2545                                               MachineIRBuilder &B) const {
2546   B.setInstr(MI);
2547   const LLT S32 = LLT::scalar(32);
2548 
2549   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2550   Register DstReg = MI.getOperand(0).getReg();
2551   Register LHS = MI.getOperand(1).getReg();
2552   Register RHS = MI.getOperand(2).getReg();
2553 
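  // Compute absolute values: with Sign = x >> 31 (all ones when x is
  // negative), (x + Sign) ^ Sign gives |x|. The unsigned div/rem runs on the
  // absolute values and the sign is restored afterwards (the remainder takes
  // the sign of LHS, the quotient the xor of both signs).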
2554   auto ThirtyOne = B.buildConstant(S32, 31);
2555   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2556   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2557 
2558   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2559   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2560 
2561   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2562   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2563 
2564   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2565   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2566 
2567   if (IsRem) {
2568     auto RSign = LHSign; // Remainder sign is the same as LHS
2569     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2570     B.buildSub(DstReg, UDivRem, RSign);
2571   } else {
2572     auto DSign = B.buildXor(S32, LHSign, RHSign);
2573     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2574     B.buildSub(DstReg, UDivRem, DSign);
2575   }
2576 
2577   MI.eraseFromParent();
2578   return true;
2579 }
2580 
2581 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2582                                             MachineRegisterInfo &MRI,
2583                                             MachineIRBuilder &B) const {
2584   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2585     return legalizeSDIV_SREM32(MI, MRI, B);
2586   return false;
2587 }
2588 
2589 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2590                                                  MachineRegisterInfo &MRI,
2591                                                  MachineIRBuilder &B) const {
2592   Register Res = MI.getOperand(0).getReg();
2593   Register LHS = MI.getOperand(1).getReg();
2594   Register RHS = MI.getOperand(2).getReg();
2595 
2596   uint16_t Flags = MI.getFlags();
2597 
2598   LLT ResTy = MRI.getType(Res);
2599   LLT S32 = LLT::scalar(32);
2600   LLT S64 = LLT::scalar(64);
2601 
2602   const MachineFunction &MF = B.getMF();
2603   bool Unsafe =
2604     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2605 
2606   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2607     return false;
2608 
2609   if (!Unsafe && ResTy == S32 &&
2610       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2611     return false;
2612 
2613   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2614     // 1 / x -> RCP(x)
2615     if (CLHS->isExactlyValue(1.0)) {
2616       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2617         .addUse(RHS)
2618         .setMIFlags(Flags);
2619 
2620       MI.eraseFromParent();
2621       return true;
2622     }
2623 
2624     // -1 / x -> RCP( FNEG(x) )
2625     if (CLHS->isExactlyValue(-1.0)) {
2626       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2627       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2628         .addUse(FNeg.getReg(0))
2629         .setMIFlags(Flags);
2630 
2631       MI.eraseFromParent();
2632       return true;
2633     }
2634   }
2635 
2636   // x / y -> x * (1.0 / y)
2637   if (Unsafe) {
2638     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2639       .addUse(RHS)
2640       .setMIFlags(Flags);
2641     B.buildFMul(Res, LHS, RCP, Flags);
2642 
2643     MI.eraseFromParent();
2644     return true;
2645   }
2646 
2647   return false;
2648 }
2649 
2650 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2651                                          MachineRegisterInfo &MRI,
2652                                          MachineIRBuilder &B) const {
2653   B.setInstr(MI);
2654   Register Res = MI.getOperand(0).getReg();
2655   Register LHS = MI.getOperand(1).getReg();
2656   Register RHS = MI.getOperand(2).getReg();
2657 
2658   uint16_t Flags = MI.getFlags();
2659 
2660   LLT S16 = LLT::scalar(16);
2661   LLT S32 = LLT::scalar(32);
2662 
2663   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2664   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2665 
2666   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2667     .addUse(RHSExt.getReg(0))
2668     .setMIFlags(Flags);
2669 
2670   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2671   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2672 
2673   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2674     .addUse(RDst.getReg(0))
2675     .addUse(RHS)
2676     .addUse(LHS)
2677     .setMIFlags(Flags);
2678 
2679   MI.eraseFromParent();
2680   return true;
2681 }
2682 
2683 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2684 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2685 static void toggleSPDenormMode(bool Enable,
2686                                MachineIRBuilder &B,
2687                                const GCNSubtarget &ST,
2688                                AMDGPU::SIModeRegisterDefaults Mode) {
2689   // Set SP denorm mode to this value.
2690   unsigned SPDenormMode =
2691     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2692 
2693   if (ST.hasDenormModeInst()) {
2694     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2695     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2696 
2697     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2698     B.buildInstr(AMDGPU::S_DENORM_MODE)
2699       .addImm(NewDenormModeValue);
2700 
2701   } else {
2702     // Select FP32 bit field in mode register.
2703     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2704                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2705                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
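    // This is hwreg(HW_REG_MODE, 4, 2): the two MODE register bits at offset
    // 4 control single-precision denormal handling.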
2706 
2707     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2708       .addImm(SPDenormMode)
2709       .addImm(SPDenormModeBitField);
2710   }
2711 }
2712 
2713 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2714                                          MachineRegisterInfo &MRI,
2715                                          MachineIRBuilder &B) const {
2716   B.setInstr(MI);
2717   Register Res = MI.getOperand(0).getReg();
2718   Register LHS = MI.getOperand(1).getReg();
2719   Register RHS = MI.getOperand(2).getReg();
2720   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2721   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2722 
2723   uint16_t Flags = MI.getFlags();
2724 
2725   LLT S32 = LLT::scalar(32);
2726   LLT S1 = LLT::scalar(1);
2727 
2728   auto One = B.buildFConstant(S32, 1.0f);
2729 
2730   auto DenominatorScaled =
2731     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2732       .addUse(RHS)
2733       .addUse(LHS)
2734       .addImm(1)
2735       .setMIFlags(Flags);
2736   auto NumeratorScaled =
2737     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2738       .addUse(LHS)
2739       .addUse(RHS)
2740       .addImm(0)
2741       .setMIFlags(Flags);
2742 
2743   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2744     .addUse(DenominatorScaled.getReg(0))
2745     .setMIFlags(Flags);
2746   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2747 
2748   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2749   // aren't modeled as reading it.
2750   if (!Mode.allFP32Denormals())
2751     toggleSPDenormMode(true, B, ST, Mode);
2752 
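  // Newton-Raphson refinement: Fma0/Fma1 improve the reciprocal estimate, Mul
  // forms the initial quotient, Fma2/Fma3 refine it, and Fma4 is the remaining
  // error term consumed by div_fmas below.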
2753   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2754   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2755   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2756   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2757   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2758   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2759 
2760   if (!Mode.allFP32Denormals())
2761     toggleSPDenormMode(false, B, ST, Mode);
2762 
2763   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2764     .addUse(Fma4.getReg(0))
2765     .addUse(Fma1.getReg(0))
2766     .addUse(Fma3.getReg(0))
2767     .addUse(NumeratorScaled.getReg(1))
2768     .setMIFlags(Flags);
2769 
2770   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2771     .addUse(Fmas.getReg(0))
2772     .addUse(RHS)
2773     .addUse(LHS)
2774     .setMIFlags(Flags);
2775 
2776   MI.eraseFromParent();
2777   return true;
2778 }
2779 
2780 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2781                                          MachineRegisterInfo &MRI,
2782                                          MachineIRBuilder &B) const {
2783   B.setInstr(MI);
2784   Register Res = MI.getOperand(0).getReg();
2785   Register LHS = MI.getOperand(1).getReg();
2786   Register RHS = MI.getOperand(2).getReg();
2787 
2788   uint16_t Flags = MI.getFlags();
2789 
2790   LLT S64 = LLT::scalar(64);
2791   LLT S1 = LLT::scalar(1);
2792 
2793   auto One = B.buildFConstant(S64, 1.0);
2794 
2795   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2796     .addUse(LHS)
2797     .addUse(RHS)
2798     .addImm(1)
2799     .setMIFlags(Flags);
2800 
2801   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2802 
2803   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2804     .addUse(DivScale0.getReg(0))
2805     .setMIFlags(Flags);
2806 
2807   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2808   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2809   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2810 
2811   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2812     .addUse(LHS)
2813     .addUse(RHS)
2814     .addImm(0)
2815     .setMIFlags(Flags);
2816 
2817   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2818   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2819   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2820 
2821   Register Scale;
2822   if (!ST.hasUsableDivScaleConditionOutput()) {
2823     // Workaround a hardware bug on SI where the condition output from div_scale
2824     // is not usable.
2825 
2826     LLT S32 = LLT::scalar(32);
2827 
2828     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2829     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2830     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2831     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2832 
2833     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2834                               Scale1Unmerge.getReg(1));
2835     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2836                               Scale0Unmerge.getReg(1));
2837     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2838   } else {
2839     Scale = DivScale1.getReg(1);
2840   }
2841 
2842   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2843     .addUse(Fma4.getReg(0))
2844     .addUse(Fma3.getReg(0))
2845     .addUse(Mul.getReg(0))
2846     .addUse(Scale)
2847     .setMIFlags(Flags);
2848 
2849   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2850     .addUse(Fmas.getReg(0))
2851     .addUse(RHS)
2852     .addUse(LHS)
2853     .setMIFlags(Flags);
2854 
2855   MI.eraseFromParent();
2856   return true;
2857 }
2858 
2859 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2860                                                  MachineRegisterInfo &MRI,
2861                                                  MachineIRBuilder &B) const {
2862   B.setInstr(MI);
2863   Register Res = MI.getOperand(0).getReg();
2864   Register LHS = MI.getOperand(2).getReg();
2865   Register RHS = MI.getOperand(3).getReg();
2866   uint16_t Flags = MI.getFlags();
2867 
2868   LLT S32 = LLT::scalar(32);
2869   LLT S1 = LLT::scalar(1);
2870 
2871   auto Abs = B.buildFAbs(S32, RHS, Flags);
2872   const APFloat C0Val(1.0f);
2873 
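  // C0 is 2^+96 and C1 is 2^-32 as float bit patterns; C2 is 1.0. Very large
  // denominators (|rhs| > 2^96) are pre-scaled by 2^-32 before the reciprocal
  // so it stays in range, and the final multiply by Sel rescales the quotient
  // to compensate.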
2874   auto C0 = B.buildConstant(S32, 0x6f800000);
2875   auto C1 = B.buildConstant(S32, 0x2f800000);
2876   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2877 
2878   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2879   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2880 
2881   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2882 
2883   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2884     .addUse(Mul0.getReg(0))
2885     .setMIFlags(Flags);
2886 
2887   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2888 
2889   B.buildFMul(Res, Sel, Mul1, Flags);
2890 
2891   MI.eraseFromParent();
2892   return true;
2893 }
2894 
2895 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2896                                                  MachineRegisterInfo &MRI,
2897                                                  MachineIRBuilder &B) const {
2898   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2899   if (!MFI->isEntryFunction()) {
2900     return legalizePreloadedArgIntrin(MI, MRI, B,
2901                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2902   }
2903 
2904   B.setInstr(MI);
2905 
2906   uint64_t Offset =
2907     ST.getTargetLowering()->getImplicitParameterOffset(
2908       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2909   Register DstReg = MI.getOperand(0).getReg();
2910   LLT DstTy = MRI.getType(DstReg);
2911   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2912 
2913   const ArgDescriptor *Arg;
2914   const TargetRegisterClass *RC;
2915   std::tie(Arg, RC)
2916     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2917   if (!Arg)
2918     return false;
2919 
2920   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2921   if (!loadInputValue(KernargPtrReg, B, Arg))
2922     return false;
2923 
2924   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2925   MI.eraseFromParent();
2926   return true;
2927 }
2928 
2929 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2930                                               MachineRegisterInfo &MRI,
2931                                               MachineIRBuilder &B,
2932                                               unsigned AddrSpace) const {
2933   B.setInstr(MI);
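  // A flat pointer is in the given segment iff the high 32 bits of the pointer
  // equal that segment's aperture base.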
2934   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2935   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2936   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2937   MI.eraseFromParent();
2938   return true;
2939 }
2940 
2941 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2942 // offset (the offset that is included in bounds checking and swizzling, to be
2943 // split between the instruction's voffset and immoffset fields) and soffset
2944 // (the offset that is excluded from bounds checking and swizzling, to go in
2945 // the instruction's soffset field).  This function takes the first kind of
2946 // offset and figures out how to split it between voffset and immoffset.
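// For example (illustrative values): a constant offset of 4100 is split into
// ImmOffset = 4 and a voffset contribution of 4096, keeping the voffset
// copy/add a multiple of 4096 so it can be CSEd across neighbouring accesses.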
2947 std::tuple<Register, unsigned, unsigned>
2948 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2949                                         Register OrigOffset) const {
2950   const unsigned MaxImm = 4095;
2951   Register BaseReg;
2952   unsigned TotalConstOffset;
2953   MachineInstr *OffsetDef;
2954   const LLT S32 = LLT::scalar(32);
2955 
2956   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2957     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2958 
2959   unsigned ImmOffset = TotalConstOffset;
2960 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in the immoffset field so that the value that
  // is copied/added for the voffset field is a multiple of 4096, and stands
  // more chance of being CSEd with the copy/add for another similar
  // load/store.
2965   // However, do not do that rounding down to a multiple of 4096 if that is a
2966   // negative number, as it appears to be illegal to have a negative offset
2967   // in the vgpr, even if adding the immediate offset makes it positive.
2968   unsigned Overflow = ImmOffset & ~MaxImm;
2969   ImmOffset -= Overflow;
2970   if ((int32_t)Overflow < 0) {
2971     Overflow += ImmOffset;
2972     ImmOffset = 0;
2973   }
2974 
2975   if (Overflow != 0) {
2976     if (!BaseReg) {
2977       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2978     } else {
2979       auto OverflowVal = B.buildConstant(S32, Overflow);
2980       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2981     }
2982   }
2983 
2984   if (!BaseReg)
2985     BaseReg = B.buildConstant(S32, 0).getReg(0);
2986 
2987   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2988 }
2989 
2990 /// Handle register layout difference for f16 images for some subtargets.
2991 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2992                                              MachineRegisterInfo &MRI,
2993                                              Register Reg) const {
2994   if (!ST.hasUnpackedD16VMem())
2995     return Reg;
2996 
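  // In the unpacked layout every 16-bit element occupies the low half of its
  // own 32-bit register, so any-extend each element and rebuild the value as a
  // vector with 32-bit elements.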
2997   const LLT S16 = LLT::scalar(16);
2998   const LLT S32 = LLT::scalar(32);
2999   LLT StoreVT = MRI.getType(Reg);
3000   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3001 
3002   auto Unmerge = B.buildUnmerge(S16, Reg);
3003 
3004   SmallVector<Register, 4> WideRegs;
3005   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3006     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3007 
3008   int NumElts = StoreVT.getNumElements();
3009 
3010   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3011 }
3012 
3013 Register AMDGPULegalizerInfo::fixStoreSourceType(
3014   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3015   MachineRegisterInfo *MRI = B.getMRI();
3016   LLT Ty = MRI->getType(VData);
3017 
3018   const LLT S16 = LLT::scalar(16);
3019 
  // Fixup illegal register types for 8-bit and 16-bit stores.
3021   if (Ty == LLT::scalar(8) || Ty == S16) {
3022     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3023     return AnyExt;
3024   }
3025 
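  // Packed 16-bit vector data may need to be widened to the unpacked register
  // layout for format stores on some subtargets.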
3026   if (Ty.isVector()) {
3027     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3028       if (IsFormat)
3029         return handleD16VData(B, *MRI, VData);
3030     }
3031   }
3032 
3033   return VData;
3034 }
3035 
3036 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3037                                               MachineRegisterInfo &MRI,
3038                                               MachineIRBuilder &B,
3039                                               bool IsTyped,
3040                                               bool IsFormat) const {
3041   B.setInstr(MI);
3042 
3043   Register VData = MI.getOperand(1).getReg();
3044   LLT Ty = MRI.getType(VData);
3045   LLT EltTy = Ty.getScalarType();
3046   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3047   const LLT S32 = LLT::scalar(32);
3048 
3049   VData = fixStoreSourceType(B, VData, IsFormat);
3050   Register RSrc = MI.getOperand(2).getReg();
3051 
3052   MachineMemOperand *MMO = *MI.memoperands_begin();
3053   const int MemSize = MMO->getSize();
3054 
3055   unsigned ImmOffset;
3056   unsigned TotalOffset;
3057 
3058   // The typed intrinsics add an immediate after the registers.
3059   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3060 
3061   // The struct intrinsic variants add one additional operand over raw.
3062   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3063   Register VIndex;
3064   int OpOffset = 0;
3065   if (HasVIndex) {
3066     VIndex = MI.getOperand(3).getReg();
3067     OpOffset = 1;
3068   }
3069 
3070   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3071   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3072 
3073   unsigned Format = 0;
3074   if (IsTyped) {
3075     Format = MI.getOperand(5 + OpOffset).getImm();
3076     ++OpOffset;
3077   }
3078 
3079   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3080 
3081   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3082   if (TotalOffset != 0)
3083     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3084 
3085   unsigned Opc;
3086   if (IsTyped) {
3087     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3088                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3089   } else if (IsFormat) {
3090     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3091                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3092   } else {
3093     switch (MemSize) {
3094     case 1:
3095       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3096       break;
3097     case 2:
3098       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3099       break;
3100     default:
3101       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3102       break;
3103     }
3104   }
3105 
3106   if (!VIndex)
3107     VIndex = B.buildConstant(S32, 0).getReg(0);
3108 
3109   auto MIB = B.buildInstr(Opc)
3110     .addUse(VData)              // vdata
3111     .addUse(RSrc)               // rsrc
3112     .addUse(VIndex)             // vindex
3113     .addUse(VOffset)            // voffset
3114     .addUse(SOffset)            // soffset
3115     .addImm(ImmOffset);         // offset(imm)
3116 
3117   if (IsTyped)
3118     MIB.addImm(Format);
3119 
3120   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3121      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3122      .addMemOperand(MMO);
3123 
3124   MI.eraseFromParent();
3125   return true;
3126 }
3127 
3128 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3129                                              MachineRegisterInfo &MRI,
3130                                              MachineIRBuilder &B,
3131                                              bool IsFormat,
3132                                              bool IsTyped) const {
3133   B.setInstr(MI);
3134 
3135   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3136   MachineMemOperand *MMO = *MI.memoperands_begin();
3137   const int MemSize = MMO->getSize();
3138   const LLT S32 = LLT::scalar(32);
3139 
3140   Register Dst = MI.getOperand(0).getReg();
3141   Register RSrc = MI.getOperand(2).getReg();
3142 
3143   // The typed intrinsics add an immediate after the registers.
3144   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3145 
3146   // The struct intrinsic variants add one additional operand over raw.
3147   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3148   Register VIndex;
3149   int OpOffset = 0;
3150   if (HasVIndex) {
3151     VIndex = MI.getOperand(3).getReg();
3152     OpOffset = 1;
3153   }
3154 
3155   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3156   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3157 
3158   unsigned Format = 0;
3159   if (IsTyped) {
3160     Format = MI.getOperand(5 + OpOffset).getImm();
3161     ++OpOffset;
3162   }
3163 
3164   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3165   unsigned ImmOffset;
3166   unsigned TotalOffset;
3167 
3168   LLT Ty = MRI.getType(Dst);
3169   LLT EltTy = Ty.getScalarType();
3170   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3171   const bool Unpacked = ST.hasUnpackedD16VMem();
3172 
3173   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3174   if (TotalOffset != 0)
3175     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3176 
3177   unsigned Opc;
3178 
3179   if (IsTyped) {
3180     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3181                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3182   } else if (IsFormat) {
3183     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3184                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3185   } else {
3186     switch (MemSize) {
3187     case 1:
3188       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3189       break;
3190     case 2:
3191       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3192       break;
3193     default:
3194       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3195       break;
3196     }
3197   }
3198 
3199   Register LoadDstReg;
3200 
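  // Sub-dword and scalar d16 results come back widened to 32 bits and need a
  // truncate afterwards; unpacked d16 vector results need to be repacked into
  // the original packed 16-bit vector type.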
3201   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3202   LLT UnpackedTy = Ty.changeElementSize(32);
3203 
3204   if (IsExtLoad)
3205     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3206   else if (Unpacked && IsD16 && Ty.isVector())
3207     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3208   else
3209     LoadDstReg = Dst;
3210 
3211   if (!VIndex)
3212     VIndex = B.buildConstant(S32, 0).getReg(0);
3213 
3214   auto MIB = B.buildInstr(Opc)
3215     .addDef(LoadDstReg)         // vdata
3216     .addUse(RSrc)               // rsrc
3217     .addUse(VIndex)             // vindex
3218     .addUse(VOffset)            // voffset
3219     .addUse(SOffset)            // soffset
3220     .addImm(ImmOffset);         // offset(imm)
3221 
3222   if (IsTyped)
3223     MIB.addImm(Format);
3224 
3225   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3226      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3227      .addMemOperand(MMO);
3228 
3229   if (LoadDstReg != Dst) {
3230     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3231 
    // The result of an extending load was widened to 32 bits; truncate it
    // back down to the original type.
3233     if (IsExtLoad)
3234       B.buildTrunc(Dst, LoadDstReg);
3235     else {
3236       // Repack to original 16-bit vector result
3237       // FIXME: G_TRUNC should work, but legalization currently fails
3238       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3239       SmallVector<Register, 4> Repack;
3240       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3241         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3242       B.buildMerge(Dst, Repack);
3243     }
3244   }
3245 
3246   MI.eraseFromParent();
3247   return true;
3248 }
3249 
3250 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3251                                                MachineIRBuilder &B,
3252                                                bool IsInc) const {
3253   B.setInstr(MI);
3254   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3255                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3256   B.buildInstr(Opc)
3257     .addDef(MI.getOperand(0).getReg())
3258     .addUse(MI.getOperand(2).getReg())
3259     .addUse(MI.getOperand(3).getReg())
3260     .cloneMemRefs(MI);
3261   MI.eraseFromParent();
3262   return true;
3263 }
3264 
3265 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3266   switch (IntrID) {
3267   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3268   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3269     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3270   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3271   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3272     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3273   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3274   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3275     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3276   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3277   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3278     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3279   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3280   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3281     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3282   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3283   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3284     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3285   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3286   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3287     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3288   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3289   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3290     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3291   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3292   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3293     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3294   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3295   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3296     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3297   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3298   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3299     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3300   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3301   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3302     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3303   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3304   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3305     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3306   default:
3307     llvm_unreachable("unhandled atomic opcode");
3308   }
3309 }
3310 
3311 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3312                                                MachineIRBuilder &B,
3313                                                Intrinsic::ID IID) const {
3314   B.setInstr(MI);
3315 
3316   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3317                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3318 
3319   Register Dst = MI.getOperand(0).getReg();
3320   Register VData = MI.getOperand(2).getReg();
3321 
3322   Register CmpVal;
3323   int OpOffset = 0;
3324 
3325   if (IsCmpSwap) {
3326     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3327     ++OpOffset;
3328   }
3329 
3330   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3331   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3332 
3333   // The struct intrinsic variants add one additional operand over raw.
3334   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3335   Register VIndex;
3336   if (HasVIndex) {
3337     VIndex = MI.getOperand(4 + OpOffset).getReg();
3338     ++OpOffset;
3339   }
3340 
3341   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3342   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3343   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3344 
3345   MachineMemOperand *MMO = *MI.memoperands_begin();
3346 
3347   unsigned ImmOffset;
3348   unsigned TotalOffset;
3349   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3350   if (TotalOffset != 0)
3351     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3352 
3353   if (!VIndex)
3354     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3355 
3356   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3357     .addDef(Dst)
3358     .addUse(VData); // vdata
3359 
3360   if (IsCmpSwap)
3361     MIB.addReg(CmpVal);
3362 
3363   MIB.addUse(RSrc)               // rsrc
3364      .addUse(VIndex)             // vindex
3365      .addUse(VOffset)            // voffset
3366      .addUse(SOffset)            // soffset
3367      .addImm(ImmOffset)          // offset(imm)
3368      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3369      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3370      .addMemOperand(MMO);
3371 
3372   MI.eraseFromParent();
3373   return true;
3374 }
3375 
3376 // Produce a vector of s16 elements from s32 pieces.
3377 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3378                              ArrayRef<Register> UnmergeParts) {
3379   const LLT S16 = LLT::scalar(16);
3380 
3381   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3382   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3383     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3384 
3385   B.buildBuildVector(DstReg, RemergeParts);
3386 }
3387 
3388 /// Convert a set of s32 registers to a result vector with s16 elements.
3389 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3390                                ArrayRef<Register> UnmergeParts) {
3391   MachineRegisterInfo &MRI = *B.getMRI();
3392   const LLT V2S16 = LLT::vector(2, 16);
3393   LLT TargetTy = MRI.getType(DstReg);
3394   int NumElts = UnmergeParts.size();
3395 
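  // Each 32-bit part already holds two packed 16-bit elements, so the result
  // can be rebuilt with bitcasts rather than truncating and re-merging.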
3396   if (NumElts == 1) {
3397     assert(TargetTy == V2S16);
3398     B.buildBitcast(DstReg, UnmergeParts[0]);
3399     return;
3400   }
3401 
3402   SmallVector<Register, 4> RemergeParts(NumElts);
3403   for (int I = 0; I != NumElts; ++I)
3404     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3405 
3406   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3407     B.buildConcatVectors(DstReg, RemergeParts);
3408     return;
3409   }
3410 
3411   const LLT V3S16 = LLT::vector(3, 16);
3412   const LLT V6S16 = LLT::vector(6, 16);
3413 
3414   // Widen to v6s16 and unpack v3 parts.
3415   assert(TargetTy == V3S16);
3416 
3417   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3418   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3419   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3420 }
3421 
// FIXME: Just a vector trunc should be sufficient, but legalization is
// currently broken.
3424 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3425                                   Register WideDstReg) {
3426   const LLT S32 = LLT::scalar(32);
3427   const LLT S16 = LLT::scalar(16);
3428 
3429   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3430 
3431   int NumOps = Unmerge->getNumOperands() - 1;
3432   SmallVector<Register, 4> RemergeParts(NumOps);
3433   for (int I = 0; I != NumOps; ++I)
3434     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3435 
3436   B.buildBuildVector(DstReg, RemergeParts);
3437 }
3438 
3439 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3440     MachineInstr &MI, MachineIRBuilder &B,
3441     GISelChangeObserver &Observer,
3442     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3443   bool IsTFE = MI.getNumExplicitDefs() == 2;
3444 
  // We only need to process the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or when we need to repack the TFE
  // result.
3447 
3448   // TODO: Need to handle a16 images too
3449   // TODO: Do we need to guard against already legalized intrinsics?
3450   if (!IsTFE && !ST.hasUnpackedD16VMem())
3451     return true;
3452 
3453   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3454     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3455 
3456   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3457     return true;
3458 
3459   B.setInstr(MI);
3460 
3461   MachineRegisterInfo *MRI = B.getMRI();
3462   const LLT S32 = LLT::scalar(32);
3463   const LLT S16 = LLT::scalar(16);
3464 
3465   if (BaseOpcode->Store) { // No TFE for stores?
3466     Register VData = MI.getOperand(1).getReg();
3467     LLT Ty = MRI->getType(VData);
3468     if (!Ty.isVector() || Ty.getElementType() != S16)
3469       return true;
3470 
3471     B.setInstr(MI);
3472 
3473     Observer.changingInstr(MI);
3474     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3475     Observer.changedInstr(MI);
3476     return true;
3477   }
3478 
3479   Register DstReg = MI.getOperand(0).getReg();
3480   LLT Ty = MRI->getType(DstReg);
3481   const LLT EltTy = Ty.getScalarType();
3482   const bool IsD16 = Ty.getScalarType() == S16;
3483   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3484 
3485   if (IsTFE) {
3486     // In the IR, TFE is supposed to be used with a 2 element struct return
    // type. The instruction really returns these two values in one contiguous
3488     // register, with one additional dword beyond the loaded data. Rewrite the
3489     // return type to use a single register result.
3490     Register Dst1Reg = MI.getOperand(1).getReg();
3491     if (MRI->getType(Dst1Reg) != S32)
3492       return false;
3493 
3494     // TODO: Make sure the TFE operand bit is set.
3495 
3496     // The raw dword aligned data component of the load. The only legal cases
3497     // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3499     LLT RoundedTy;
3500     LLT TFETy;
3501 
3502     if (IsD16 && ST.hasUnpackedD16VMem()) {
3503       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3504       TFETy = LLT::vector(NumElts + 1, 32);
3505     } else {
3506       unsigned EltSize = Ty.getScalarSizeInBits();
3507       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3508       unsigned RoundedSize = 32 * RoundedElts;
3509       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3510       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3511     }
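    // For example, a packed <4 x s16> d16 load gives RoundedTy = <4 x s16> and
    // TFETy = <3 x s32>: two data dwords plus the trailing TFE status dword.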
3512 
3513     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3514     Observer.changingInstr(MI);
3515 
3516     MI.getOperand(0).setReg(TFEReg);
3517     MI.RemoveOperand(1);
3518 
3519     Observer.changedInstr(MI);
3520 
3521     // Insert after the instruction.
3522     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3523 
3524     // Now figure out how to copy the new result register back into the old
3525     // result.
3526 
3527     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3528     int NumDataElts = TFETy.getNumElements() - 1;
3529 
3530     if (!Ty.isVector()) {
3531       // Simplest case is a trivial unmerge (plus a truncate for d16).
3532       UnmergeResults[0] = Ty == S32 ?
3533         DstReg : MRI->createGenericVirtualRegister(S32);
3534 
3535       B.buildUnmerge(UnmergeResults, TFEReg);
3536       if (Ty != S32)
3537         B.buildTrunc(DstReg, UnmergeResults[0]);
3538       return true;
3539     }
3540 
3541     // We have to repack into a new vector of some kind.
3542     for (int I = 0; I != NumDataElts; ++I)
3543       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3544     B.buildUnmerge(UnmergeResults, TFEReg);
3545 
3546     // Drop the final TFE element.
3547     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3548 
3549     if (EltTy == S32)
3550       B.buildBuildVector(DstReg, DataPart);
3551     else if (ST.hasUnpackedD16VMem())
3552       truncToS16Vector(B, DstReg, DataPart);
3553     else
3554       bitcastToS16Vector(B, DstReg, DataPart);
3555 
3556     return true;
3557   }
3558 
3559   // Must be an image load.
3560   if (!Ty.isVector() || Ty.getElementType() != S16)
3561     return true;
3562 
3563   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3564 
3565   LLT WidenedTy = Ty.changeElementType(S32);
3566   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3567 
3568   Observer.changingInstr(MI);
3569   MI.getOperand(0).setReg(WideDstReg);
3570   Observer.changedInstr(MI);
3571 
3572   repackUnpackedD16Load(B, DstReg, WideDstReg);
3573   return true;
3574 }
3575 
3576 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3577   MachineInstr &MI, MachineIRBuilder &B,
3578   GISelChangeObserver &Observer) const {
3579   Register Dst = MI.getOperand(0).getReg();
3580   LLT Ty = B.getMRI()->getType(Dst);
3581   unsigned Size = Ty.getSizeInBits();
3582   MachineFunction &MF = B.getMF();
3583 
3584   Observer.changingInstr(MI);
3585 
3586   // FIXME: We don't really need this intermediate instruction. The intrinsic
3587   // should be fixed to have a memory operand. Since it's readnone, we're not
3588   // allowed to add one.
3589   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3590   MI.RemoveOperand(1); // Remove intrinsic ID
3591 
3592   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3593   // TODO: Should this use datalayout alignment?
3594   const unsigned MemSize = (Size + 7) / 8;
3595   const unsigned MemAlign = 4;
3596   MachineMemOperand *MMO = MF.getMachineMemOperand(
3597     MachinePointerInfo(),
3598     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3599     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3600   MI.addMemOperand(MF, MMO);
3601 
3602   // There are no 96-bit result scalar loads, but widening to 128-bit should
3603   // always be legal. We may need to restore this to a 96-bit result if it turns
3604   // out this needs to be converted to a vector load during RegBankSelect.
3605   if (!isPowerOf2_32(Size)) {
3606     LegalizerHelper Helper(MF, *this, Observer, B);
3607     B.setInstr(MI);
3608 
3609     if (Ty.isVector())
3610       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3611     else
3612       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3613   }
3614 
3615   Observer.changedInstr(MI);
3616   return true;
3617 }
3618 
3619 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
3620                                                 MachineRegisterInfo &MRI,
3621                                                 MachineIRBuilder &B) const {
3622   B.setInstr(MI);
3623 
  // If this is not the HSA trap-handler ABI or the trap handler is disabled,
  // lower the trap to an s_endpgm instruction.
3625   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3626       !ST.isTrapHandlerEnabled()) {
3627     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
3628   } else {
3629     // Pass queue pointer to trap handler as input, and insert trap instruction
3630     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
3631     const ArgDescriptor *Arg =
3632         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
3633     if (!Arg)
3634       return false;
3635     MachineRegisterInfo &MRI = *B.getMRI();
3636     Register SGPR01(AMDGPU::SGPR0_SGPR1);
3637     Register LiveIn = getLiveInRegister(
3638         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
3639         /*InsertLiveInCopy=*/false);
3640     if (!loadInputValue(LiveIn, B, Arg))
3641       return false;
3642     B.buildCopy(SGPR01, LiveIn);
3643     B.buildInstr(AMDGPU::S_TRAP)
3644         .addImm(GCNSubtarget::TrapIDLLVMTrap)
3645         .addReg(SGPR01, RegState::Implicit);
3646   }
3647 
3648   MI.eraseFromParent();
3649   return true;
3650 }
3651 
3652 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
3653     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3654   B.setInstr(MI);
3655 
  // If this is not the HSA trap-handler ABI or the trap handler is disabled,
  // report a warning that debugtrap is not supported.
3658   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3659       !ST.isTrapHandlerEnabled()) {
3660     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
3661                                      "debugtrap handler not supported",
3662                                      MI.getDebugLoc(), DS_Warning);
3663     LLVMContext &Ctx = B.getMF().getFunction().getContext();
3664     Ctx.diagnose(NoTrap);
3665   } else {
3666     // Insert debug-trap instruction
3667     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
3668   }
3669 
3670   MI.eraseFromParent();
3671   return true;
3672 }
3673 
3674 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3675                                             MachineIRBuilder &B,
3676                                             GISelChangeObserver &Observer) const {
3677   MachineRegisterInfo &MRI = *B.getMRI();
3678 
  // Replace the G_BRCOND that uses these intrinsics with the exec-manipulating
  // branch pseudos.
3680   auto IntrID = MI.getIntrinsicID();
3681   switch (IntrID) {
3682   case Intrinsic::amdgcn_if:
3683   case Intrinsic::amdgcn_else: {
3684     MachineInstr *Br = nullptr;
3685     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3686       const SIRegisterInfo *TRI
3687         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3688 
3689       B.setInstr(*BrCond);
3690       Register Def = MI.getOperand(1).getReg();
3691       Register Use = MI.getOperand(3).getReg();
3692 
3693       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3694       if (Br)
3695         BrTarget = Br->getOperand(0).getMBB();
3696 
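      // Rewrite the conditional branch into the SI_IF/SI_ELSE pseudo, which
      // manipulates the exec mask and branches to the G_BRCOND target (or to
      // the following unconditional branch's target if one is present).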
3697       if (IntrID == Intrinsic::amdgcn_if) {
3698         B.buildInstr(AMDGPU::SI_IF)
3699           .addDef(Def)
3700           .addUse(Use)
3701           .addMBB(BrTarget);
3702       } else {
3703         B.buildInstr(AMDGPU::SI_ELSE)
3704           .addDef(Def)
3705           .addUse(Use)
3706           .addMBB(BrTarget)
3707           .addImm(0);
3708       }
3709 
3710       if (Br)
3711         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3712 
3713       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3714       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3715       MI.eraseFromParent();
3716       BrCond->eraseFromParent();
3717       return true;
3718     }
3719 
3720     return false;
3721   }
3722   case Intrinsic::amdgcn_loop: {
3723     MachineInstr *Br = nullptr;
3724     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3725       const SIRegisterInfo *TRI
3726         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3727 
3728       B.setInstr(*BrCond);
3729 
3730       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3731       if (Br)
3732         BrTarget = Br->getOperand(0).getMBB();
3733 
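      // SI_LOOP consumes the loop mask and, while any lanes remain active,
      // updates the exec mask and branches back to the target block.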
3734       Register Reg = MI.getOperand(2).getReg();
3735       B.buildInstr(AMDGPU::SI_LOOP)
3736         .addUse(Reg)
3737         .addMBB(BrTarget);
3738 
3739       if (Br)
3740         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3741 
3742       MI.eraseFromParent();
3743       BrCond->eraseFromParent();
3744       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3745       return true;
3746     }
3747 
3748     return false;
3749   }
3750   case Intrinsic::amdgcn_kernarg_segment_ptr:
3751     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
3752       B.setInstr(MI);
3753       // This only makes sense to call in a kernel, so just lower to null.
3754       B.buildConstant(MI.getOperand(0).getReg(), 0);
3755       MI.eraseFromParent();
3756       return true;
3757     }
3758 
3759     return legalizePreloadedArgIntrin(
3760       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3761   case Intrinsic::amdgcn_implicitarg_ptr:
3762     return legalizeImplicitArgPtr(MI, MRI, B);
3763   case Intrinsic::amdgcn_workitem_id_x:
3764     return legalizePreloadedArgIntrin(MI, MRI, B,
3765                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3766   case Intrinsic::amdgcn_workitem_id_y:
3767     return legalizePreloadedArgIntrin(MI, MRI, B,
3768                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3769   case Intrinsic::amdgcn_workitem_id_z:
3770     return legalizePreloadedArgIntrin(MI, MRI, B,
3771                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3772   case Intrinsic::amdgcn_workgroup_id_x:
3773     return legalizePreloadedArgIntrin(MI, MRI, B,
3774                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3775   case Intrinsic::amdgcn_workgroup_id_y:
3776     return legalizePreloadedArgIntrin(MI, MRI, B,
3777                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3778   case Intrinsic::amdgcn_workgroup_id_z:
3779     return legalizePreloadedArgIntrin(MI, MRI, B,
3780                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3781   case Intrinsic::amdgcn_dispatch_ptr:
3782     return legalizePreloadedArgIntrin(MI, MRI, B,
3783                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3784   case Intrinsic::amdgcn_queue_ptr:
3785     return legalizePreloadedArgIntrin(MI, MRI, B,
3786                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3787   case Intrinsic::amdgcn_implicit_buffer_ptr:
3788     return legalizePreloadedArgIntrin(
3789       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3790   case Intrinsic::amdgcn_dispatch_id:
3791     return legalizePreloadedArgIntrin(MI, MRI, B,
3792                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3793   case Intrinsic::amdgcn_fdiv_fast:
3794     return legalizeFDIVFastIntrin(MI, MRI, B);
3795   case Intrinsic::amdgcn_is_shared:
3796     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3797   case Intrinsic::amdgcn_is_private:
3798     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3799   case Intrinsic::amdgcn_wavefrontsize: {
3800     B.setInstr(MI);
3801     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3802     MI.eraseFromParent();
3803     return true;
3804   }
3805   case Intrinsic::amdgcn_s_buffer_load:
3806     return legalizeSBufferLoad(MI, B, Observer);
3807   case Intrinsic::amdgcn_raw_buffer_store:
3808   case Intrinsic::amdgcn_struct_buffer_store:
3809     return legalizeBufferStore(MI, MRI, B, false, false);
3810   case Intrinsic::amdgcn_raw_buffer_store_format:
3811   case Intrinsic::amdgcn_struct_buffer_store_format:
3812     return legalizeBufferStore(MI, MRI, B, false, true);
3813   case Intrinsic::amdgcn_raw_tbuffer_store:
3814   case Intrinsic::amdgcn_struct_tbuffer_store:
3815     return legalizeBufferStore(MI, MRI, B, true, true);
3816   case Intrinsic::amdgcn_raw_buffer_load:
3817   case Intrinsic::amdgcn_struct_buffer_load:
3818     return legalizeBufferLoad(MI, MRI, B, false, false);
3819   case Intrinsic::amdgcn_raw_buffer_load_format:
3820   case Intrinsic::amdgcn_struct_buffer_load_format:
3821     return legalizeBufferLoad(MI, MRI, B, true, false);
3822   case Intrinsic::amdgcn_raw_tbuffer_load:
3823   case Intrinsic::amdgcn_struct_tbuffer_load:
3824     return legalizeBufferLoad(MI, MRI, B, true, true);
3825   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3826   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3827   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3828   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3829   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3830   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3831   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3832   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3833   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3834   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3835   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3836   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3837   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3838   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3839   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3840   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3841   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3842   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3843   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3844   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3845   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3846   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3847   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3848   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3849   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3850   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3851     return legalizeBufferAtomic(MI, B, IntrID);
3852   case Intrinsic::amdgcn_atomic_inc:
3853     return legalizeAtomicIncDec(MI, B, true);
3854   case Intrinsic::amdgcn_atomic_dec:
3855     return legalizeAtomicIncDec(MI, B, false);
3856   case Intrinsic::trap:
3857     return legalizeTrapIntrinsic(MI, MRI, B);
3858   case Intrinsic::debugtrap:
3859     return legalizeDebugTrapIntrinsic(MI, MRI, B);
3860   default: {
3861     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3862             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3863       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3864     return true;
3865   }
3866   }
3867 
3868   return true;
3869 }
3870