1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
46 // Round the number of vector elements up to the next power of two.
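// For example, a 3-element vector becomes a 4-element vector; a count that is
// already a power of two is unchanged.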
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the scalar size in bits up to the next power of two.
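// For example, s65 becomes s128, while s32 stays s32.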
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
70   return [=](const LegalityQuery &Query) {
71     return Query.Types[TypeIdx].getSizeInBits() == Size;
72   };
73 }
74 
75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     return Ty.isVector() &&
79            Ty.getNumElements() % 2 != 0 &&
80            Ty.getElementType().getSizeInBits() < 32 &&
81            Ty.getSizeInBits() % 32 != 0;
82   };
83 }
84 
85 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
86   return [=](const LegalityQuery &Query) {
87     const LLT Ty = Query.Types[TypeIdx];
88     const LLT EltTy = Ty.getScalarType();
89     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
90   };
91 }
92 
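// Pad a vector type with one additional element, e.g. <3 x s16> -> <4 x s16>.
// Used together with isSmallOddVector to round small odd vectors up to a
// 32-bit multiple.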
93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getElementType();
97     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
98   };
99 }
100 
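// Break a wide vector into roughly 64-bit pieces; used below with
// vectorWiderThan(0, 64), e.g. <4 x s32> is reduced to <2 x s32>.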
101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     unsigned Size = Ty.getSizeInBits();
106     unsigned Pieces = (Size + 63) / 64;
107     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
108     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
109   };
110 }
111 
112 // Increase the number of vector elements so the total size reaches the next
113 // multiple of 32 bits.
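// For example, <3 x s8> (24 bits) is widened to <4 x s8> (32 bits).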
114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
115   return [=](const LegalityQuery &Query) {
116     const LLT Ty = Query.Types[TypeIdx];
117 
118     const LLT EltTy = Ty.getElementType();
119     const int Size = Ty.getSizeInBits();
120     const int EltSize = EltTy.getSizeInBits();
121     const int NextMul32 = (Size + 31) / 32;
122 
123     assert(EltSize < 32);
124 
125     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
126     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
127   };
128 }
129 
130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
131   return [=](const LegalityQuery &Query) {
132     const LLT QueryTy = Query.Types[TypeIdx];
133     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
134   };
135 }
136 
137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
138   return [=](const LegalityQuery &Query) {
139     const LLT QueryTy = Query.Types[TypeIdx];
140     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
141   };
142 }
143 
144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT QueryTy = Query.Types[TypeIdx];
147     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
148   };
149 }
150 
151 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
152 // v2s16.
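// For example, v4s16, s96 and v2s64 qualify, while v3s16 and s48 do not.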
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     if (Ty.isVector()) {
157       const int EltSize = Ty.getElementType().getSizeInBits();
158       return EltSize == 32 || EltSize == 64 ||
159             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
160              EltSize == 128 || EltSize == 256;
161     }
162 
163     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
164   };
165 }
166 
167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getElementType() == Type;
171   };
172 }
173 
174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     if (!QueryTy.isVector())
178       return false;
179     const LLT EltTy = QueryTy.getElementType();
180     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
181   };
182 }
183 
184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     const LLT Ty = Query.Types[TypeIdx];
187     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
188            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
189   };
190 }
191 
192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
193   return [=](const LegalityQuery &Query) {
194     return Query.Types[TypeIdx0].getSizeInBits() <
195            Query.Types[TypeIdx1].getSizeInBits();
196   };
197 }
198 
199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
200   return [=](const LegalityQuery &Query) {
201     return Query.Types[TypeIdx0].getSizeInBits() >
202            Query.Types[TypeIdx1].getSizeInBits();
203   };
204 }
205 
206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
207                                          const GCNTargetMachine &TM)
208   :  ST(ST_) {
209   using namespace TargetOpcode;
210 
211   auto GetAddrSpacePtr = [&TM](unsigned AS) {
212     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
213   };
214 
215   const LLT S1 = LLT::scalar(1);
216   const LLT S16 = LLT::scalar(16);
217   const LLT S32 = LLT::scalar(32);
218   const LLT S64 = LLT::scalar(64);
219   const LLT S128 = LLT::scalar(128);
220   const LLT S256 = LLT::scalar(256);
221   const LLT S512 = LLT::scalar(512);
222   const LLT S1024 = LLT::scalar(1024);
223 
224   const LLT V2S16 = LLT::vector(2, 16);
225   const LLT V4S16 = LLT::vector(4, 16);
226 
227   const LLT V2S32 = LLT::vector(2, 32);
228   const LLT V3S32 = LLT::vector(3, 32);
229   const LLT V4S32 = LLT::vector(4, 32);
230   const LLT V5S32 = LLT::vector(5, 32);
231   const LLT V6S32 = LLT::vector(6, 32);
232   const LLT V7S32 = LLT::vector(7, 32);
233   const LLT V8S32 = LLT::vector(8, 32);
234   const LLT V9S32 = LLT::vector(9, 32);
235   const LLT V10S32 = LLT::vector(10, 32);
236   const LLT V11S32 = LLT::vector(11, 32);
237   const LLT V12S32 = LLT::vector(12, 32);
238   const LLT V13S32 = LLT::vector(13, 32);
239   const LLT V14S32 = LLT::vector(14, 32);
240   const LLT V15S32 = LLT::vector(15, 32);
241   const LLT V16S32 = LLT::vector(16, 32);
242   const LLT V32S32 = LLT::vector(32, 32);
243 
244   const LLT V2S64 = LLT::vector(2, 64);
245   const LLT V3S64 = LLT::vector(3, 64);
246   const LLT V4S64 = LLT::vector(4, 64);
247   const LLT V5S64 = LLT::vector(5, 64);
248   const LLT V6S64 = LLT::vector(6, 64);
249   const LLT V7S64 = LLT::vector(7, 64);
250   const LLT V8S64 = LLT::vector(8, 64);
251   const LLT V16S64 = LLT::vector(16, 64);
252 
253   std::initializer_list<LLT> AllS32Vectors =
254     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
255      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
256   std::initializer_list<LLT> AllS64Vectors =
257     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
258 
259   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
260   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
261   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
262   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
263   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
264   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
265   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
266 
267   const LLT CodePtr = FlatPtr;
268 
269   const std::initializer_list<LLT> AddrSpaces64 = {
270     GlobalPtr, ConstantPtr, FlatPtr
271   };
272 
273   const std::initializer_list<LLT> AddrSpaces32 = {
274     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
275   };
276 
277   const std::initializer_list<LLT> FPTypesBase = {
278     S32, S64
279   };
280 
281   const std::initializer_list<LLT> FPTypes16 = {
282     S32, S64, S16
283   };
284 
285   const std::initializer_list<LLT> FPTypesPK16 = {
286     S32, S64, S16, V2S16
287   };
288 
289   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
290 
291   setAction({G_BRCOND, S1}, Legal); // VCC branches
292   setAction({G_BRCOND, S32}, Legal); // SCC branches
293 
294   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
295   // elements for v3s16
296   getActionDefinitionsBuilder(G_PHI)
297     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
298     .legalFor(AllS32Vectors)
299     .legalFor(AllS64Vectors)
300     .legalFor(AddrSpaces64)
301     .legalFor(AddrSpaces32)
302     .clampScalar(0, S32, S256)
303     .widenScalarToNextPow2(0, 32)
304     .clampMaxNumElements(0, S32, 16)
305     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
306     .legalIf(isPointer(0));
307 
308   if (ST.hasVOP3PInsts()) {
309     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
310       .legalFor({S32, S16, V2S16})
311       .clampScalar(0, S16, S32)
312       .clampMaxNumElements(0, S16, 2)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else if (ST.has16BitInsts()) {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32, S16})
318       .clampScalar(0, S16, S32)
319       .scalarize(0)
320       .widenScalarToNextPow2(0, 32);
321   } else {
322     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
323       .legalFor({S32})
324       .clampScalar(0, S32, S32)
325       .scalarize(0);
326   }
327 
328   // FIXME: Not really legal. Placeholder for custom lowering.
329   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
330     .customFor({S32, S64})
331     .clampScalar(0, S32, S64)
332     .widenScalarToNextPow2(0, 32)
333     .scalarize(0);
334 
335   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
336     .legalFor({S32})
337     .clampScalar(0, S32, S32)
338     .scalarize(0);
339 
340   // Report legal for any types we can handle anywhere. For the cases only legal
341   // on the SALU, RegBankSelect will be able to re-legalize.
342   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
343     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
344     .clampScalar(0, S32, S64)
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
347     .widenScalarToNextPow2(0)
348     .scalarize(0);
349 
350   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
351                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
352     .legalFor({{S32, S1}, {S32, S32}})
353     .minScalar(0, S32)
354     // TODO: .scalarize(0)
355     .lower();
356 
357   getActionDefinitionsBuilder(G_BITCAST)
358     // Don't worry about the size constraint.
359     .legalIf(all(isRegisterType(0), isRegisterType(1)))
360     .lower();
361 
362 
363   getActionDefinitionsBuilder(G_CONSTANT)
364     .legalFor({S1, S32, S64, S16, GlobalPtr,
365                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
366     .clampScalar(0, S32, S64)
367     .widenScalarToNextPow2(0)
368     .legalIf(isPointer(0));
369 
370   getActionDefinitionsBuilder(G_FCONSTANT)
371     .legalFor({S32, S64, S16})
372     .clampScalar(0, S16, S64);
373 
374   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
375       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
376                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
377       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378       .clampScalarOrElt(0, S32, S1024)
379       .legalIf(isMultiple32(0))
380       .widenScalarToNextPow2(0, 32)
381       .clampMaxNumElements(0, S32, 16);
382 
383   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
384   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
385     .unsupportedFor({PrivatePtr})
386     .custom();
387   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
388 
389   auto &FPOpActions = getActionDefinitionsBuilder(
390     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
391     .legalFor({S32, S64});
392   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
393     .customFor({S32, S64});
394   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
395     .customFor({S32, S64});
396 
397   if (ST.has16BitInsts()) {
398     if (ST.hasVOP3PInsts())
399       FPOpActions.legalFor({S16, V2S16});
400     else
401       FPOpActions.legalFor({S16});
402 
403     TrigActions.customFor({S16});
404     FDIVActions.customFor({S16});
405   }
406 
407   auto &MinNumMaxNum = getActionDefinitionsBuilder({
408       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
409 
410   if (ST.hasVOP3PInsts()) {
411     MinNumMaxNum.customFor(FPTypesPK16)
412       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
413       .clampMaxNumElements(0, S16, 2)
414       .clampScalar(0, S16, S64)
415       .scalarize(0);
416   } else if (ST.has16BitInsts()) {
417     MinNumMaxNum.customFor(FPTypes16)
418       .clampScalar(0, S16, S64)
419       .scalarize(0);
420   } else {
421     MinNumMaxNum.customFor(FPTypesBase)
422       .clampScalar(0, S32, S64)
423       .scalarize(0);
424   }
425 
426   if (ST.hasVOP3PInsts())
427     FPOpActions.clampMaxNumElements(0, S16, 2);
428 
429   FPOpActions
430     .scalarize(0)
431     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
432 
433   TrigActions
434     .scalarize(0)
435     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
436 
437   FDIVActions
438     .scalarize(0)
439     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
440 
441   getActionDefinitionsBuilder({G_FNEG, G_FABS})
442     .legalFor(FPTypesPK16)
443     .clampMaxNumElements(0, S16, 2)
444     .scalarize(0)
445     .clampScalar(0, S16, S64);
446 
447   if (ST.has16BitInsts()) {
448     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
449       .legalFor({S32, S64, S16})
450       .scalarize(0)
451       .clampScalar(0, S16, S64);
452   } else {
453     getActionDefinitionsBuilder(G_FSQRT)
454       .legalFor({S32, S64})
455       .scalarize(0)
456       .clampScalar(0, S32, S64);
457 
458     if (ST.hasFractBug()) {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .customFor({S64})
461         .legalFor({S32, S64})
462         .scalarize(0)
463         .clampScalar(0, S32, S64);
464     } else {
465       getActionDefinitionsBuilder(G_FFLOOR)
466         .legalFor({S32, S64})
467         .scalarize(0)
468         .clampScalar(0, S32, S64);
469     }
470   }
471 
472   getActionDefinitionsBuilder(G_FPTRUNC)
473     .legalFor({{S32, S64}, {S16, S32}})
474     .scalarize(0)
475     .lower();
476 
477   getActionDefinitionsBuilder(G_FPEXT)
478     .legalFor({{S64, S32}, {S32, S16}})
479     .lowerFor({{S64, S16}}) // FIXME: Implement
480     .scalarize(0);
481 
482   getActionDefinitionsBuilder(G_FSUB)
483       // Use actual fsub instruction
484       .legalFor({S32})
485       // Must use fadd + fneg
486       .lowerFor({S64, S16, V2S16})
487       .scalarize(0)
488       .clampScalar(0, S32, S64);
489 
490   // Whether this is legal depends on the floating point mode for the function.
491   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
492   if (ST.hasMadF16())
493     FMad.customFor({S32, S16});
494   else
495     FMad.customFor({S32});
496   FMad.scalarize(0)
497       .lower();
498 
499   // TODO: Do we need to clamp maximum bitwidth?
500   getActionDefinitionsBuilder(G_TRUNC)
501     .legalIf(isScalar(0))
502     .legalFor({{V2S16, V2S32}})
503     .clampMaxNumElements(0, S16, 2)
504     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
505     // situations (like an invalid implicit use), we don't want to infinite loop
506     // in the legalizer.
507     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
508     .alwaysLegal();
509 
510   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
511     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
512                {S32, S1}, {S64, S1}, {S16, S1}})
513     .scalarize(0)
514     .clampScalar(0, S32, S64)
515     .widenScalarToNextPow2(1, 32);
516 
517   // TODO: Split s1->s64 during regbankselect for VALU.
518   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
519     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
520     .lowerFor({{S32, S64}})
521     .lowerIf(typeIs(1, S1))
522     .customFor({{S64, S64}});
523   if (ST.has16BitInsts())
524     IToFP.legalFor({{S16, S16}});
525   IToFP.clampScalar(1, S32, S64)
526        .scalarize(0)
527        .widenScalarToNextPow2(1);
528 
529   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
530     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
531     .customFor({{S64, S64}});
532   if (ST.has16BitInsts())
533     FPToI.legalFor({{S16, S16}});
534   else
535     FPToI.minScalar(1, S32);
536 
537   FPToI.minScalar(0, S32)
538        .scalarize(0)
539        .lower();
540 
541   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
542     .scalarize(0)
543     .lower();
544 
545   if (ST.has16BitInsts()) {
546     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
547       .legalFor({S16, S32, S64})
548       .clampScalar(0, S16, S64)
549       .scalarize(0);
550   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
551     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
552       .legalFor({S32, S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   } else {
556     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
557       .legalFor({S32})
558       .customFor({S64})
559       .clampScalar(0, S32, S64)
560       .scalarize(0);
561   }
562 
563   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
564     .scalarize(0)
565     .alwaysLegal();
566 
567   auto &CmpBuilder =
568     getActionDefinitionsBuilder(G_ICMP)
569     // The compare output type differs based on the register bank of the output,
570     // so make both s1 and s32 legal.
571     //
572     // Scalar compares producing output in scc will be promoted to s32, as that
573     // is the allocatable register type that will be needed for the copy from
574     // scc. This will be promoted during RegBankSelect, and we assume something
575     // before that won't try to use s32 result types.
576     //
577     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
578     // bank.
579     .legalForCartesianProduct(
580       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
581     .legalForCartesianProduct(
582       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
583   if (ST.has16BitInsts()) {
584     CmpBuilder.legalFor({{S1, S16}});
585   }
586 
587   CmpBuilder
588     .widenScalarToNextPow2(1)
589     .clampScalar(1, S32, S64)
590     .scalarize(0)
591     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
592 
593   getActionDefinitionsBuilder(G_FCMP)
594     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
595     .widenScalarToNextPow2(1)
596     .clampScalar(1, S32, S64)
597     .scalarize(0);
598 
599   // FIXME: fpow has a selection pattern that should move to custom lowering.
600   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
601   if (ST.has16BitInsts())
602     Exp2Ops.legalFor({S32, S16});
603   else
604     Exp2Ops.legalFor({S32});
605   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
606   Exp2Ops.scalarize(0);
607 
608   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
609   if (ST.has16BitInsts())
610     ExpOps.customFor({{S32}, {S16}});
611   else
612     ExpOps.customFor({S32});
613   ExpOps.clampScalar(0, MinScalarFPTy, S32)
614         .scalarize(0);
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder(G_CTPOP)
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   // The hardware instructions return a different result on 0 than the generic
626   // instructions expect. The hardware produces -1, but these produce the
627   // bitwidth.
628   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
629     .scalarize(0)
630     .clampScalar(0, S32, S32)
631     .clampScalar(1, S32, S64)
632     .widenScalarToNextPow2(0, 32)
633     .widenScalarToNextPow2(1, 32)
634     .lower();
635 
636   // The 64-bit versions produce 32-bit results, but only on the SALU.
637   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
638     .legalFor({{S32, S32}, {S32, S64}})
639     .clampScalar(0, S32, S32)
640     .clampScalar(1, S32, S64)
641     .scalarize(0)
642     .widenScalarToNextPow2(0, 32)
643     .widenScalarToNextPow2(1, 32);
644 
645   getActionDefinitionsBuilder(G_BITREVERSE)
646     .legalFor({S32})
647     .clampScalar(0, S32, S32)
648     .scalarize(0);
649 
650   if (ST.has16BitInsts()) {
651     getActionDefinitionsBuilder(G_BSWAP)
652       .legalFor({S16, S32, V2S16})
653       .clampMaxNumElements(0, S16, 2)
654       // FIXME: Fixing non-power-of-2 before clamp is workaround for
655       // narrowScalar limitation.
656       .widenScalarToNextPow2(0)
657       .clampScalar(0, S16, S32)
658       .scalarize(0);
659 
660     if (ST.hasVOP3PInsts()) {
661       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
662         .legalFor({S32, S16, V2S16})
663         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
664         .clampMaxNumElements(0, S16, 2)
665         .minScalar(0, S16)
666         .widenScalarToNextPow2(0)
667         .scalarize(0)
668         .lower();
669     } else {
670       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
671         .legalFor({S32, S16})
672         .widenScalarToNextPow2(0)
673         .minScalar(0, S16)
674         .scalarize(0)
675         .lower();
676     }
677   } else {
678     // TODO: Should have same legality without v_perm_b32
679     getActionDefinitionsBuilder(G_BSWAP)
680       .legalFor({S32})
681       .lowerIf(narrowerThan(0, 32))
682       // FIXME: Fixing non-power-of-2 before clamp is workaround for
683       // narrowScalar limitation.
684       .widenScalarToNextPow2(0)
685       .maxScalar(0, S32)
686       .scalarize(0)
687       .lower();
688 
689     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
690       .legalFor({S32})
691       .minScalar(0, S32)
692       .widenScalarToNextPow2(0)
693       .scalarize(0)
694       .lower();
695   }
696 
697   getActionDefinitionsBuilder(G_INTTOPTR)
698     // List the common cases
699     .legalForCartesianProduct(AddrSpaces64, {S64})
700     .legalForCartesianProduct(AddrSpaces32, {S32})
701     .scalarize(0)
702     // Accept any address space as long as the size matches
703     .legalIf(sameSize(0, 1))
704     .widenScalarIf(smallerThan(1, 0),
705       [](const LegalityQuery &Query) {
706         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
707       })
708     .narrowScalarIf(greaterThan(1, 0),
709       [](const LegalityQuery &Query) {
710         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
711       });
712 
713   getActionDefinitionsBuilder(G_PTRTOINT)
714     // List the common cases
715     .legalForCartesianProduct(AddrSpaces64, {S64})
716     .legalForCartesianProduct(AddrSpaces32, {S32})
717     .scalarize(0)
718     // Accept any address space as long as the size matches
719     .legalIf(sameSize(0, 1))
720     .widenScalarIf(smallerThan(0, 1),
721       [](const LegalityQuery &Query) {
722         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
723       })
724     .narrowScalarIf(
725       greaterThan(0, 1),
726       [](const LegalityQuery &Query) {
727         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
728       });
729 
730   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
731     .scalarize(0)
732     .custom();
733 
734   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
735   // handle some operations by just promoting the register during
736   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
737   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
738     switch (AS) {
739     // FIXME: Private element size.
740     case AMDGPUAS::PRIVATE_ADDRESS:
741       return 32;
742     // FIXME: Check subtarget
743     case AMDGPUAS::LOCAL_ADDRESS:
744       return ST.useDS128() ? 128 : 64;
745 
746     // Treat constant and global as identical. SMRD loads are sometimes usable
747     // for global loads (ideally constant address space should be eliminated)
748     // depending on the context. Legality cannot be context dependent, but
749     // RegBankSelect can split the load as necessary depending on the pointer
750     // register bank/uniformity and if the memory is invariant or not written in
751     // a kernel.
752     case AMDGPUAS::CONSTANT_ADDRESS:
753     case AMDGPUAS::GLOBAL_ADDRESS:
754       return IsLoad ? 512 : 128;
755     default:
756       return 128;
757     }
758   };
759 
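  // Decide whether a load or store must be broken up, e.g. vector extloads,
  // accesses larger than the per-address-space limit, or under-aligned
  // accesses that the target cannot handle.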
760   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
761                                     bool IsLoad) -> bool {
762     const LLT DstTy = Query.Types[0];
763 
764     // Split vector extloads.
765     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
766     unsigned Align = Query.MMODescrs[0].AlignInBits;
767 
768     if (MemSize < DstTy.getSizeInBits())
769       MemSize = std::max(MemSize, Align);
770 
771     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
772       return true;
773 
774     const LLT PtrTy = Query.Types[1];
775     unsigned AS = PtrTy.getAddressSpace();
776     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
777       return true;
778 
779     // Catch weird-sized loads that don't evenly divide into the access sizes.
780     // TODO: May be able to widen depending on alignment etc.
781     unsigned NumRegs = (MemSize + 31) / 32;
782     if (NumRegs == 3) {
783       if (!ST.hasDwordx3LoadStores())
784         return true;
785     } else {
786       // If the alignment allows, these should have been widened.
787       if (!isPowerOf2_32(NumRegs))
788         return true;
789     }
790 
791     if (Align < MemSize) {
792       const SITargetLowering *TLI = ST.getTargetLowering();
793       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
794     }
795 
796     return false;
797   };
798 
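  // Widen an odd-sized load result when the alignment already covers the
  // rounded-up size, e.g. an s96 load aligned to 128 bits can be widened to
  // s128 on targets without dwordx3 loads.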
799   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
800     unsigned Size = Query.Types[0].getSizeInBits();
801     if (isPowerOf2_32(Size))
802       return false;
803 
804     if (Size == 96 && ST.hasDwordx3LoadStores())
805       return false;
806 
807     unsigned AddrSpace = Query.Types[1].getAddressSpace();
808     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
809       return false;
810 
811     unsigned Align = Query.MMODescrs[0].AlignInBits;
812     unsigned RoundedSize = NextPowerOf2(Size);
813     return (Align >= RoundedSize);
814   };
815 
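  // With unaligned buffer access, no minimum alignment (0) is required for the
  // global/flat entries in the tables below; otherwise require the listed
  // alignment in bits.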
816   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
817   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
818   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
819 
820   // TODO: Refine based on subtargets which support unaligned access or 128-bit
821   // LDS
822   // TODO: Unsupported flat for SI.
823 
824   for (unsigned Op : {G_LOAD, G_STORE}) {
825     const bool IsStore = Op == G_STORE;
826 
827     auto &Actions = getActionDefinitionsBuilder(Op);
828     // Whitelist the common cases.
829     // TODO: Loads to s16 on gfx9
830     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
831                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
832                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
833                                       {S128, GlobalPtr, 128, GlobalAlign32},
834                                       {S64, GlobalPtr, 64, GlobalAlign32},
835                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
836                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
837                                       {S32, GlobalPtr, 8, GlobalAlign8},
838                                       {S32, GlobalPtr, 16, GlobalAlign16},
839 
840                                       {S32, LocalPtr, 32, 32},
841                                       {S64, LocalPtr, 64, 32},
842                                       {V2S32, LocalPtr, 64, 32},
843                                       {S32, LocalPtr, 8, 8},
844                                       {S32, LocalPtr, 16, 16},
845                                       {V2S16, LocalPtr, 32, 32},
846 
847                                       {S32, PrivatePtr, 32, 32},
848                                       {S32, PrivatePtr, 8, 8},
849                                       {S32, PrivatePtr, 16, 16},
850                                       {V2S16, PrivatePtr, 32, 32},
851 
852                                       {S32, FlatPtr, 32, GlobalAlign32},
853                                       {S32, FlatPtr, 16, GlobalAlign16},
854                                       {S32, FlatPtr, 8, GlobalAlign8},
855                                       {V2S16, FlatPtr, 32, GlobalAlign32},
856 
857                                       {S32, ConstantPtr, 32, GlobalAlign32},
858                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
859                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
860                                       {S64, ConstantPtr, 64, GlobalAlign32},
861                                       {S128, ConstantPtr, 128, GlobalAlign32},
862                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
863     Actions
864         .customIf(typeIs(1, Constant32Ptr))
865         // Widen suitably aligned loads by loading extra elements.
866         .moreElementsIf([=](const LegalityQuery &Query) {
867             const LLT Ty = Query.Types[0];
868             return Op == G_LOAD && Ty.isVector() &&
869                    shouldWidenLoadResult(Query);
870           }, moreElementsToNextPow2(0))
871         .widenScalarIf([=](const LegalityQuery &Query) {
872             const LLT Ty = Query.Types[0];
873             return Op == G_LOAD && !Ty.isVector() &&
874                    shouldWidenLoadResult(Query);
875           }, widenScalarOrEltToNextPow2(0))
876         .narrowScalarIf(
877             [=](const LegalityQuery &Query) -> bool {
878               return !Query.Types[0].isVector() &&
879                      needToSplitMemOp(Query, Op == G_LOAD);
880             },
881             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
882               const LLT DstTy = Query.Types[0];
883               const LLT PtrTy = Query.Types[1];
884 
885               const unsigned DstSize = DstTy.getSizeInBits();
886               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
887 
888               // Split extloads.
889               if (DstSize > MemSize)
890                 return std::make_pair(0, LLT::scalar(MemSize));
891 
892               if (!isPowerOf2_32(DstSize)) {
893                 // We're probably decomposing an odd sized store. Try to split
894                 // to the widest type. TODO: Account for alignment. As-is it
895                 // should be OK, since the new parts will be further legalized.
896                 unsigned FloorSize = PowerOf2Floor(DstSize);
897                 return std::make_pair(0, LLT::scalar(FloorSize));
898               }
899 
900               if (DstSize > 32 && (DstSize % 32 != 0)) {
901                 // FIXME: Need a way to specify non-extload of larger size if
902                 // suitably aligned.
903                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
904               }
905 
906               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
907                                                      Op == G_LOAD);
908               if (MemSize > MaxSize)
909                 return std::make_pair(0, LLT::scalar(MaxSize));
910 
911               unsigned Align = Query.MMODescrs[0].AlignInBits;
912               return std::make_pair(0, LLT::scalar(Align));
913             })
914         .fewerElementsIf(
915             [=](const LegalityQuery &Query) -> bool {
916               return Query.Types[0].isVector() &&
917                      needToSplitMemOp(Query, Op == G_LOAD);
918             },
919             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
920               const LLT DstTy = Query.Types[0];
921               const LLT PtrTy = Query.Types[1];
922 
923               LLT EltTy = DstTy.getElementType();
924               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
925                                                      Op == G_LOAD);
926 
927               // FIXME: Handle widened to power of 2 results better. This ends
928               // up scalarizing.
929               // FIXME: 3 element stores scalarized on SI
930 
931               // Split if it's too large for the address space.
932               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
933                 unsigned NumElts = DstTy.getNumElements();
934                 unsigned EltSize = EltTy.getSizeInBits();
935 
936                 if (MaxSize % EltSize == 0) {
937                   return std::make_pair(
938                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
939                 }
940 
941                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
942 
943                 // FIXME: Refine when odd breakdowns handled
944                 // The scalars will need to be re-legalized.
945                 if (NumPieces == 1 || NumPieces >= NumElts ||
946                     NumElts % NumPieces != 0)
947                   return std::make_pair(0, EltTy);
948 
949                 return std::make_pair(0,
950                                       LLT::vector(NumElts / NumPieces, EltTy));
951               }
952 
953               // FIXME: We could probably handle weird extending loads better.
954               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
955               if (DstTy.getSizeInBits() > MemSize)
956                 return std::make_pair(0, EltTy);
957 
958               unsigned EltSize = EltTy.getSizeInBits();
959               unsigned DstSize = DstTy.getSizeInBits();
960               if (!isPowerOf2_32(DstSize)) {
961                 // We're probably decomposing an odd sized store. Try to split
962                 // to the widest type. TODO: Account for alignment. As-is it
963                 // should be OK, since the new parts will be further legalized.
964                 unsigned FloorSize = PowerOf2Floor(DstSize);
965                 return std::make_pair(
966                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
967               }
968 
969               // Need to split because of alignment.
970               unsigned Align = Query.MMODescrs[0].AlignInBits;
971               if (EltSize > Align &&
972                   (EltSize / Align < DstTy.getNumElements())) {
973                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
974               }
975 
976               // May need relegalization for the scalars.
977               return std::make_pair(0, EltTy);
978             })
979         .minScalar(0, S32);
980 
981     if (IsStore)
982       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
983 
984     // TODO: Need a bitcast lower option?
985     Actions
986         .legalIf([=](const LegalityQuery &Query) {
987           const LLT Ty0 = Query.Types[0];
988           unsigned Size = Ty0.getSizeInBits();
989           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
990           unsigned Align = Query.MMODescrs[0].AlignInBits;
991 
992           // FIXME: Widening store from alignment not valid.
993           if (MemSize < Size)
994             MemSize = std::max(MemSize, Align);
995 
996           // No extending vector loads.
997           if (Size > MemSize && Ty0.isVector())
998             return false;
999 
1000           switch (MemSize) {
1001           case 8:
1002           case 16:
1003             return Size == 32;
1004           case 32:
1005           case 64:
1006           case 128:
1007             return true;
1008           case 96:
1009             return ST.hasDwordx3LoadStores();
1010           case 256:
1011           case 512:
1012             return true;
1013           default:
1014             return false;
1015           }
1016         })
1017         .widenScalarToNextPow2(0)
1018         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1019   }
1020 
1021   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1022                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1023                                                   {S32, GlobalPtr, 16, 2 * 8},
1024                                                   {S32, LocalPtr, 8, 8},
1025                                                   {S32, LocalPtr, 16, 16},
1026                                                   {S32, PrivatePtr, 8, 8},
1027                                                   {S32, PrivatePtr, 16, 16},
1028                                                   {S32, ConstantPtr, 8, 8},
1029                                                   {S32, ConstantPtr, 16, 2 * 8}});
1030   if (ST.hasFlatAddressSpace()) {
1031     ExtLoads.legalForTypesWithMemDesc(
1032         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1033   }
1034 
1035   ExtLoads.clampScalar(0, S32, S32)
1036           .widenScalarToNextPow2(0)
1037           .unsupportedIfMemSizeNotPow2()
1038           .lower();
1039 
1040   auto &Atomics = getActionDefinitionsBuilder(
1041     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1042      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1043      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1044      G_ATOMICRMW_UMIN})
1045     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1046                {S64, GlobalPtr}, {S64, LocalPtr}});
1047   if (ST.hasFlatAddressSpace()) {
1048     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1049   }
1050 
1051   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1052     .legalFor({{S32, LocalPtr}});
1053 
1054   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
1055   // output demarshalling.
1056   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1057     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1058                 {S32, FlatPtr}, {S64, FlatPtr}})
1059     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1060                {S32, RegionPtr}, {S64, RegionPtr}});
1061   // TODO: Pointer types, any 32-bit or 64-bit vector
1062 
1063   // Condition should be s32 for scalar, s1 for vector.
1064   getActionDefinitionsBuilder(G_SELECT)
1065     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1066           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1067           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1068     .clampScalar(0, S16, S64)
1069     .scalarize(1)
1070     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1071     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1072     .clampMaxNumElements(0, S32, 2)
1073     .clampMaxNumElements(0, LocalPtr, 2)
1074     .clampMaxNumElements(0, PrivatePtr, 2)
1075     .scalarize(0)
1076     .widenScalarToNextPow2(0)
1077     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1078 
1079   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1080   // be more flexible with the shift amount type.
1081   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1082     .legalFor({{S32, S32}, {S64, S32}});
1083   if (ST.has16BitInsts()) {
1084     if (ST.hasVOP3PInsts()) {
1085       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1086             .clampMaxNumElements(0, S16, 2);
1087     } else
1088       Shifts.legalFor({{S16, S16}});
1089 
1090     // TODO: Support 16-bit shift amounts for all types
1091     Shifts.widenScalarIf(
1092       [=](const LegalityQuery &Query) {
1093         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1094         // 32-bit amount.
1095         const LLT ValTy = Query.Types[0];
1096         const LLT AmountTy = Query.Types[1];
1097         return ValTy.getSizeInBits() <= 16 &&
1098                AmountTy.getSizeInBits() < 16;
1099       }, changeTo(1, S16));
1100     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1101     Shifts.clampScalar(1, S32, S32);
1102     Shifts.clampScalar(0, S16, S64);
1103     Shifts.widenScalarToNextPow2(0, 16);
1104   } else {
1105     // Make sure we legalize the shift amount type first, as the general
1106     // expansion for the shifted type will produce much worse code if it hasn't
1107     // been truncated already.
1108     Shifts.clampScalar(1, S32, S32);
1109     Shifts.clampScalar(0, S32, S64);
1110     Shifts.widenScalarToNextPow2(0, 32);
1111   }
1112   Shifts.scalarize(0);
1113 
1114   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1115     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1116     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1117     unsigned IdxTypeIdx = 2;
1118 
1119     getActionDefinitionsBuilder(Op)
1120       .customIf([=](const LegalityQuery &Query) {
1121           const LLT EltTy = Query.Types[EltTypeIdx];
1122           const LLT VecTy = Query.Types[VecTypeIdx];
1123           const LLT IdxTy = Query.Types[IdxTypeIdx];
1124           return (EltTy.getSizeInBits() == 16 ||
1125                   EltTy.getSizeInBits() % 32 == 0) &&
1126                  VecTy.getSizeInBits() % 32 == 0 &&
1127                  VecTy.getSizeInBits() <= 1024 &&
1128                  IdxTy.getSizeInBits() == 32;
1129         })
1130       .clampScalar(EltTypeIdx, S32, S64)
1131       .clampScalar(VecTypeIdx, S32, S64)
1132       .clampScalar(IdxTypeIdx, S32, S32);
1133   }
1134 
1135   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1136     .unsupportedIf([=](const LegalityQuery &Query) {
1137         const LLT &EltTy = Query.Types[1].getElementType();
1138         return Query.Types[0] != EltTy;
1139       });
1140 
1141   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1142     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1143     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1144 
1145     // FIXME: Doesn't handle extract of illegal sizes.
1146     getActionDefinitionsBuilder(Op)
1147       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1148       // FIXME: Multiples of 16 should not be legal.
1149       .legalIf([=](const LegalityQuery &Query) {
1150           const LLT BigTy = Query.Types[BigTyIdx];
1151           const LLT LitTy = Query.Types[LitTyIdx];
1152           return (BigTy.getSizeInBits() % 32 == 0) &&
1153                  (LitTy.getSizeInBits() % 16 == 0);
1154         })
1155       .widenScalarIf(
1156         [=](const LegalityQuery &Query) {
1157           const LLT BigTy = Query.Types[BigTyIdx];
1158           return (BigTy.getScalarSizeInBits() < 16);
1159         },
1160         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1161       .widenScalarIf(
1162         [=](const LegalityQuery &Query) {
1163           const LLT LitTy = Query.Types[LitTyIdx];
1164           return (LitTy.getScalarSizeInBits() < 16);
1165         },
1166         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1167       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1168       .widenScalarToNextPow2(BigTyIdx, 32);
1169 
1170   }
1171 
1172   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1173     .legalForCartesianProduct(AllS32Vectors, {S32})
1174     .legalForCartesianProduct(AllS64Vectors, {S64})
1175     .clampNumElements(0, V16S32, V32S32)
1176     .clampNumElements(0, V2S64, V16S64)
1177     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1178 
1179   if (ST.hasScalarPackInsts()) {
1180     BuildVector
1181       // FIXME: Should probably widen s1 vectors straight to s32
1182       .minScalarOrElt(0, S16)
1183       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1184       .minScalar(1, S32);
1185 
1186     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1187       .legalFor({V2S16, S32})
1188       .lower();
1189     BuildVector.minScalarOrElt(0, S32);
1190   } else {
1191     BuildVector.customFor({V2S16, S16});
1192     BuildVector.minScalarOrElt(0, S32);
1193 
1194     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1195       .customFor({V2S16, S32})
1196       .lower();
1197   }
1198 
1199   BuildVector.legalIf(isRegisterType(0));
1200 
1201   // FIXME: Clamp maximum size
1202   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1203     .legalIf(isRegisterType(0));
1204 
1205   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1206   // pre-legalize.
1207   if (ST.hasVOP3PInsts()) {
1208     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1209       .customFor({V2S16, V2S16})
1210       .lower();
1211   } else
1212     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1213 
1214   // Merge/Unmerge
1215   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1216     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1217     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1218 
1219     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1220       const LLT Ty = Query.Types[TypeIdx];
1221       if (Ty.isVector()) {
1222         const LLT &EltTy = Ty.getElementType();
1223         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1224           return true;
1225         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1226           return true;
1227       }
1228       return false;
1229     };
1230 
1231     auto &Builder = getActionDefinitionsBuilder(Op)
1232       .lowerFor({{S16, V2S16}})
1233       .lowerIf([=](const LegalityQuery &Query) {
1234           const LLT BigTy = Query.Types[BigTyIdx];
1235           return BigTy.getSizeInBits() == 32;
1236         })
1237       // Try to widen to s16 first for small types.
1238       // TODO: Only do this on targets with legal s16 shifts
1239       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1240       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1241       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1242       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1243                            elementTypeIs(1, S16)),
1244                        changeTo(1, V2S16))
1245       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1246       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1247       // valid.
1248       .clampScalar(LitTyIdx, S32, S512)
1249       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1250       // Break up vectors with weird elements into scalars
1251       .fewerElementsIf(
1252         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1253         scalarize(0))
1254       .fewerElementsIf(
1255         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1256         scalarize(1))
1257       .clampScalar(BigTyIdx, S32, S1024);
1258 
1259     if (Op == G_MERGE_VALUES) {
1260       Builder.widenScalarIf(
1261         // TODO: Use 16-bit shifts if legal for 8-bit values?
1262         [=](const LegalityQuery &Query) {
1263           const LLT Ty = Query.Types[LitTyIdx];
1264           return Ty.getSizeInBits() < 32;
1265         },
1266         changeTo(LitTyIdx, S32));
1267     }
1268 
1269     Builder.widenScalarIf(
1270       [=](const LegalityQuery &Query) {
1271         const LLT Ty = Query.Types[BigTyIdx];
1272         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1273           Ty.getSizeInBits() % 16 != 0;
1274       },
1275       [=](const LegalityQuery &Query) {
1276         // Pick the next power of 2, or a multiple of 64 above 128,
1277         // whichever is smaller.
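        // e.g. s65 is widened to s128, while s257 is widened to s320 rather
        // than s512.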
1278         const LLT &Ty = Query.Types[BigTyIdx];
1279         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1280         if (NewSizeInBits >= 256) {
1281           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1282           if (RoundedTo < NewSizeInBits)
1283             NewSizeInBits = RoundedTo;
1284         }
1285         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1286       })
1287       .legalIf([=](const LegalityQuery &Query) {
1288           const LLT &BigTy = Query.Types[BigTyIdx];
1289           const LLT &LitTy = Query.Types[LitTyIdx];
1290 
1291           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1292             return false;
1293           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1294             return false;
1295 
1296           return BigTy.getSizeInBits() % 16 == 0 &&
1297                  LitTy.getSizeInBits() % 16 == 0 &&
1298                  BigTy.getSizeInBits() <= 1024;
1299         })
1300       // Any vectors left are the wrong size. Scalarize them.
1301       .scalarize(0)
1302       .scalarize(1);
1303   }
1304 
1305   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1306   // RegBankSelect.
1307   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1308     .legalFor({{S32}, {S64}});
1309 
1310   if (ST.hasVOP3PInsts()) {
1311     SextInReg.lowerFor({{V2S16}})
1312       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1313       // get more vector shift opportunities, since we'll get those when
1314       // expanded.
1315       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1316   } else if (ST.has16BitInsts()) {
1317     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1318   } else {
1319     // Prefer to promote to s32 before lowering if we don't have 16-bit
1320     // shifts. This avoids a lot of intermediate truncate and extend operations.
1321     SextInReg.lowerFor({{S32}, {S64}});
1322   }
1323 
1324   SextInReg
1325     .scalarize(0)
1326     .clampScalar(0, S32, S64)
1327     .lower();
1328 
1329   getActionDefinitionsBuilder(G_FSHR)
1330     .legalFor({{S32, S32}})
1331     .scalarize(0)
1332     .lower();
1333 
1334   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1335     .legalFor({S64});
1336 
1337   getActionDefinitionsBuilder({
1338       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1339       G_FCOPYSIGN,
1340 
1341       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1342       G_READ_REGISTER,
1343       G_WRITE_REGISTER,
1344 
1345       G_SADDO, G_SSUBO,
1346 
1347        // TODO: Implement
1348       G_FMINIMUM, G_FMAXIMUM,
1349       G_FSHL
1350     }).lower();
1351 
1352   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1353         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1354         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1355     .unsupported();
1356 
1357   computeTables();
1358   verify(*ST.getInstrInfo());
1359 }
1360 
1361 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1362                                          MachineRegisterInfo &MRI,
1363                                          MachineIRBuilder &B,
1364                                          GISelChangeObserver &Observer) const {
1365   switch (MI.getOpcode()) {
1366   case TargetOpcode::G_ADDRSPACE_CAST:
1367     return legalizeAddrSpaceCast(MI, MRI, B);
1368   case TargetOpcode::G_FRINT:
1369     return legalizeFrint(MI, MRI, B);
1370   case TargetOpcode::G_FCEIL:
1371     return legalizeFceil(MI, MRI, B);
1372   case TargetOpcode::G_INTRINSIC_TRUNC:
1373     return legalizeIntrinsicTrunc(MI, MRI, B);
1374   case TargetOpcode::G_SITOFP:
1375     return legalizeITOFP(MI, MRI, B, true);
1376   case TargetOpcode::G_UITOFP:
1377     return legalizeITOFP(MI, MRI, B, false);
1378   case TargetOpcode::G_FPTOSI:
1379     return legalizeFPTOI(MI, MRI, B, true);
1380   case TargetOpcode::G_FPTOUI:
1381     return legalizeFPTOI(MI, MRI, B, false);
1382   case TargetOpcode::G_FMINNUM:
1383   case TargetOpcode::G_FMAXNUM:
1384   case TargetOpcode::G_FMINNUM_IEEE:
1385   case TargetOpcode::G_FMAXNUM_IEEE:
1386     return legalizeMinNumMaxNum(MI, MRI, B);
1387   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1388     return legalizeExtractVectorElt(MI, MRI, B);
1389   case TargetOpcode::G_INSERT_VECTOR_ELT:
1390     return legalizeInsertVectorElt(MI, MRI, B);
1391   case TargetOpcode::G_SHUFFLE_VECTOR:
1392     return legalizeShuffleVector(MI, MRI, B);
1393   case TargetOpcode::G_FSIN:
1394   case TargetOpcode::G_FCOS:
1395     return legalizeSinCos(MI, MRI, B);
1396   case TargetOpcode::G_GLOBAL_VALUE:
1397     return legalizeGlobalValue(MI, MRI, B);
1398   case TargetOpcode::G_LOAD:
1399     return legalizeLoad(MI, MRI, B, Observer);
1400   case TargetOpcode::G_FMAD:
1401     return legalizeFMad(MI, MRI, B);
1402   case TargetOpcode::G_FDIV:
1403     return legalizeFDIV(MI, MRI, B);
1404   case TargetOpcode::G_UDIV:
1405   case TargetOpcode::G_UREM:
1406     return legalizeUDIV_UREM(MI, MRI, B);
1407   case TargetOpcode::G_SDIV:
1408   case TargetOpcode::G_SREM:
1409     return legalizeSDIV_SREM(MI, MRI, B);
1410   case TargetOpcode::G_ATOMIC_CMPXCHG:
1411     return legalizeAtomicCmpXChg(MI, MRI, B);
1412   case TargetOpcode::G_FLOG:
1413     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1414   case TargetOpcode::G_FLOG10:
1415     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1416   case TargetOpcode::G_FEXP:
1417     return legalizeFExp(MI, B);
1418   case TargetOpcode::G_FPOW:
1419     return legalizeFPow(MI, B);
1420   case TargetOpcode::G_FFLOOR:
1421     return legalizeFFloor(MI, MRI, B);
1422   case TargetOpcode::G_BUILD_VECTOR:
1423     return legalizeBuildVector(MI, MRI, B);
1424   default:
1425     return false;
1426   }
1427 
1428   llvm_unreachable("expected switch to return");
1429 }
1430 
1431 Register AMDGPULegalizerInfo::getSegmentAperture(
1432   unsigned AS,
1433   MachineRegisterInfo &MRI,
1434   MachineIRBuilder &B) const {
1435   MachineFunction &MF = B.getMF();
1436   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1437   const LLT S32 = LLT::scalar(32);
1438 
1439   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1440 
1441   if (ST.hasApertureRegs()) {
1442     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1443     // getreg.
1444     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1445         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1446         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1447     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1448         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1449         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1450     unsigned Encoding =
1451         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1452         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1453         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1454 
1455     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1456 
1457     B.buildInstr(AMDGPU::S_GETREG_B32)
1458       .addDef(GetReg)
1459       .addImm(Encoding);
1460     MRI.setType(GetReg, S32);
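    // s_getreg returns the aperture base right-justified in a
    // (WidthM1 + 1)-bit field; shift it back up by the field width to rebuild
    // the 32-bit high half of the 64-bit aperture address.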
1461 
1462     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1463     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1464   }
1465 
1466   Register QueuePtr = MRI.createGenericVirtualRegister(
1467     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1468 
1469   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1470   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1471     return Register();
1472 
1473   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1474   // private_segment_aperture_base_hi.
1475   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1476 
1477   // TODO: can we be smarter about machine pointer info?
1478   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1479   MachineMemOperand *MMO = MF.getMachineMemOperand(
1480       PtrInfo,
1481       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1482           MachineMemOperand::MOInvariant,
1483       4, commonAlignment(Align(64), StructOffset));
1484 
1485   Register LoadAddr;
1486 
1487   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1488   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1489 }
1490 
1491 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1492   MachineInstr &MI, MachineRegisterInfo &MRI,
1493   MachineIRBuilder &B) const {
1494   MachineFunction &MF = B.getMF();
1495 
1496   B.setInstr(MI);
1497 
1498   const LLT S32 = LLT::scalar(32);
1499   Register Dst = MI.getOperand(0).getReg();
1500   Register Src = MI.getOperand(1).getReg();
1501 
1502   LLT DstTy = MRI.getType(Dst);
1503   LLT SrcTy = MRI.getType(Src);
1504   unsigned DestAS = DstTy.getAddressSpace();
1505   unsigned SrcAS = SrcTy.getAddressSpace();
1506 
1507   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1508   // vector element.
1509   assert(!DstTy.isVector());
1510 
1511   const AMDGPUTargetMachine &TM
1512     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1513 
1514   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1515   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1516     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1517     return true;
1518   }
1519 
1520   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1521     // Truncate.
1522     B.buildExtract(Dst, Src, 0);
1523     MI.eraseFromParent();
1524     return true;
1525   }
1526 
1527   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1528     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1529     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1530 
    // FIXME: This is a bit ugly: we merge two 32-bit pointer values into a
    // 64-bit pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1534     auto HighAddr = B.buildConstant(
1535       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1536     B.buildMerge(Dst, {Src, HighAddr});
1537     MI.eraseFromParent();
1538     return true;
1539   }
1540 
1541   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1542     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1543            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1544     unsigned NullVal = TM.getNullPointerValue(DestAS);
1545 
1546     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1547     auto FlatNull = B.buildConstant(SrcTy, 0);
1548 
1549     // Extract low 32-bits of the pointer.
1550     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1551 
1552     auto CmpRes =
1553         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1554     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1555 
1556     MI.eraseFromParent();
1557     return true;
1558   }
1559 
1560   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1561     return false;
1562 
1563   if (!ST.hasFlatAddressSpace())
1564     return false;
1565 
1566   auto SegmentNull =
1567       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1568   auto FlatNull =
1569       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
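  // The flat pointer is built by pairing the 32-bit segment offset (low half)
  // with the aperture base (high half); the segment null value is mapped to
  // the flat null value.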
1570 
1571   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1572   if (!ApertureReg.isValid())
1573     return false;
1574 
1575   auto CmpRes =
1576       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1577 
1578   // Coerce the type of the low half of the result so we can use merge_values.
1579   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1580 
1581   // TODO: Should we allow mismatched types but matching sizes in merges to
1582   // avoid the ptrtoint?
1583   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1584   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1585 
1586   MI.eraseFromParent();
1587   return true;
1588 }
1589 
1590 bool AMDGPULegalizerInfo::legalizeFrint(
1591   MachineInstr &MI, MachineRegisterInfo &MRI,
1592   MachineIRBuilder &B) const {
1593   B.setInstr(MI);
1594 
1595   Register Src = MI.getOperand(1).getReg();
1596   LLT Ty = MRI.getType(Src);
1597   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1598 
1599   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1600   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
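  // C1 is 2^52: adding and then subtracting a copy-signed 2^52 rounds away the
  // fractional bits, since doubles of that magnitude have none. C2 is the
  // largest double below 2^52; anything with a larger magnitude is already
  // integral and is passed through by the final select.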
1601 
1602   auto C1 = B.buildFConstant(Ty, C1Val);
1603   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1604 
1605   // TODO: Should this propagate fast-math-flags?
1606   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1607   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1608 
1609   auto C2 = B.buildFConstant(Ty, C2Val);
1610   auto Fabs = B.buildFAbs(Ty, Src);
1611 
1612   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1615 }
1616 
1617 bool AMDGPULegalizerInfo::legalizeFceil(
1618   MachineInstr &MI, MachineRegisterInfo &MRI,
1619   MachineIRBuilder &B) const {
1620   B.setInstr(MI);
1621 
1622   const LLT S1 = LLT::scalar(1);
1623   const LLT S64 = LLT::scalar(64);
1624 
1625   Register Src = MI.getOperand(1).getReg();
1626   assert(MRI.getType(Src) == S64);
1627 
1628   // result = trunc(src)
1629   // if (src > 0.0 && src != result)
1630   //   result += 1.0
1631 
1632   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1633 
1634   const auto Zero = B.buildFConstant(S64, 0.0);
1635   const auto One = B.buildFConstant(S64, 1.0);
1636   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1637   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1638   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1639   auto Add = B.buildSelect(S64, And, One, Zero);
1640 
1641   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1644 }
1645 
1646 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1647                                               MachineIRBuilder &B) {
1648   const unsigned FractBits = 52;
1649   const unsigned ExpBits = 11;
1650   LLT S32 = LLT::scalar(32);
1651 
1652   auto Const0 = B.buildConstant(S32, FractBits - 32);
1653   auto Const1 = B.buildConstant(S32, ExpBits);
1654 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1658 
1659   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1660 }
1661 
1662 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1663   MachineInstr &MI, MachineRegisterInfo &MRI,
1664   MachineIRBuilder &B) const {
1665   B.setInstr(MI);
1666 
1667   const LLT S1 = LLT::scalar(1);
1668   const LLT S32 = LLT::scalar(32);
1669   const LLT S64 = LLT::scalar(64);
1670 
1671   Register Src = MI.getOperand(1).getReg();
1672   assert(MRI.getType(Src) == S64);
1673 
1674   // TODO: Should this use extract since the low half is unused?
1675   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1676   Register Hi = Unmerge.getReg(1);
1677 
1678   // Extract the upper half, since this is where we will find the sign and
1679   // exponent.
1680   auto Exp = extractF64Exponent(Hi, B);
1681 
1682   const unsigned FractBits = 52;
1683 
1684   // Extract the sign bit.
1685   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1686   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1687 
1688   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1689 
1690   const auto Zero32 = B.buildConstant(S32, 0);
1691 
1692   // Extend back to 64-bits.
1693   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
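  // For an exponent e in [0, 51] the low (FractBits - e) bits of the value are
  // fractional: shift the all-ones fraction mask right by the exponent and
  // clear those bits to truncate toward zero. The selects below handle e < 0
  // (result is +/-0) and e > 51 (already integral).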
1694 
1695   auto Shr = B.buildAShr(S64, FractMask, Exp);
1696   auto Not = B.buildNot(S64, Shr);
1697   auto Tmp0 = B.buildAnd(S64, Src, Not);
1698   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1699 
1700   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1701   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1702 
1703   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1706 }
1707 
1708 bool AMDGPULegalizerInfo::legalizeITOFP(
1709   MachineInstr &MI, MachineRegisterInfo &MRI,
1710   MachineIRBuilder &B, bool Signed) const {
1711   B.setInstr(MI);
1712 
1713   Register Dst = MI.getOperand(0).getReg();
1714   Register Src = MI.getOperand(1).getReg();
1715 
1716   const LLT S64 = LLT::scalar(64);
1717   const LLT S32 = LLT::scalar(32);
1718 
1719   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1720 
1721   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
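  // Convert the two 32-bit halves separately: the high half (signed or
  // unsigned as appropriate) is scaled by 2^32 with ldexp, then the unsigned
  // low half is added in.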
1722 
1723   auto CvtHi = Signed ?
1724     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1725     B.buildUITOFP(S64, Unmerge.getReg(1));
1726 
1727   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1728 
1729   auto ThirtyTwo = B.buildConstant(S32, 32);
1730   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1731     .addUse(CvtHi.getReg(0))
1732     .addUse(ThirtyTwo.getReg(0));
1733 
1734   // TODO: Should this propagate fast-math-flags?
1735   B.buildFAdd(Dst, LdExp, CvtLo);
1736   MI.eraseFromParent();
1737   return true;
1738 }
1739 
1740 // TODO: Copied from DAG implementation. Verify logic and document how this
1741 // actually works.
1742 bool AMDGPULegalizerInfo::legalizeFPTOI(
1743   MachineInstr &MI, MachineRegisterInfo &MRI,
1744   MachineIRBuilder &B, bool Signed) const {
1745   B.setInstr(MI);
1746 
1747   Register Dst = MI.getOperand(0).getReg();
1748   Register Src = MI.getOperand(1).getReg();
1749 
1750   const LLT S64 = LLT::scalar(64);
1751   const LLT S32 = LLT::scalar(32);
1752 
1753   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1754 
1755   unsigned Flags = MI.getFlags();
1756 
1757   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1758   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1759   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
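  // K0 is 2^-32 and K1 is -2^32: scaling the truncated value by K0 isolates
  // the high 32 bits, and the fma multiplies the floored high part back by
  // 2^32 and subtracts it, leaving the low 32 bits.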
1760 
1761   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1762   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1763   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1764 
1765   auto Hi = Signed ?
1766     B.buildFPTOSI(S32, FloorMul) :
1767     B.buildFPTOUI(S32, FloorMul);
1768   auto Lo = B.buildFPTOUI(S32, Fma);
1769 
1770   B.buildMerge(Dst, { Lo, Hi });
1771   MI.eraseFromParent();
1772 
1773   return true;
1774 }
1775 
1776 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1777   MachineInstr &MI, MachineRegisterInfo &MRI,
1778   MachineIRBuilder &B) const {
1779   MachineFunction &MF = B.getMF();
1780   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1781 
1782   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1783                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1784 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1787   if (!MFI->getMode().IEEE)
1788     return !IsIEEEOp;
1789 
1790   if (IsIEEEOp)
1791     return true;
1792 
1793   MachineIRBuilder HelperBuilder(MI);
1794   GISelObserverWrapper DummyObserver;
1795   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1796   HelperBuilder.setInstr(MI);
1797   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1798 }
1799 
1800 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1801   MachineInstr &MI, MachineRegisterInfo &MRI,
1802   MachineIRBuilder &B) const {
1803   // TODO: Should move some of this into LegalizerHelper.
1804 
1805   // TODO: Promote dynamic indexing of s16 to s32
1806 
1807   // FIXME: Artifact combiner probably should have replaced the truncated
1808   // constant before this, so we shouldn't need
1809   // getConstantVRegValWithLookThrough.
1810   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1811     MI.getOperand(2).getReg(), MRI);
1812   if (!IdxVal) // Dynamic case will be selected to register indexing.
1813     return true;
1814 
1815   Register Dst = MI.getOperand(0).getReg();
1816   Register Vec = MI.getOperand(1).getReg();
1817 
1818   LLT VecTy = MRI.getType(Vec);
1819   LLT EltTy = VecTy.getElementType();
1820   assert(EltTy == MRI.getType(Dst));
1821 
1822   B.setInstr(MI);
1823 
1824   if (IdxVal->Value < VecTy.getNumElements())
1825     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1826   else
1827     B.buildUndef(Dst);
1828 
1829   MI.eraseFromParent();
1830   return true;
1831 }
1832 
1833 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1834   MachineInstr &MI, MachineRegisterInfo &MRI,
1835   MachineIRBuilder &B) const {
1836   // TODO: Should move some of this into LegalizerHelper.
1837 
1838   // TODO: Promote dynamic indexing of s16 to s32
1839 
1840   // FIXME: Artifact combiner probably should have replaced the truncated
1841   // constant before this, so we shouldn't need
1842   // getConstantVRegValWithLookThrough.
1843   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1844     MI.getOperand(3).getReg(), MRI);
1845   if (!IdxVal) // Dynamic case will be selected to register indexing.
1846     return true;
1847 
1848   Register Dst = MI.getOperand(0).getReg();
1849   Register Vec = MI.getOperand(1).getReg();
1850   Register Ins = MI.getOperand(2).getReg();
1851 
1852   LLT VecTy = MRI.getType(Vec);
1853   LLT EltTy = VecTy.getElementType();
1854   assert(EltTy == MRI.getType(Ins));
1855 
1856   B.setInstr(MI);
1857 
1858   if (IdxVal->Value < VecTy.getNumElements())
1859     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1860   else
1861     B.buildUndef(Dst);
1862 
1863   MI.eraseFromParent();
1864   return true;
1865 }
1866 
1867 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1868   MachineInstr &MI, MachineRegisterInfo &MRI,
1869   MachineIRBuilder &B) const {
1870   const LLT V2S16 = LLT::vector(2, 16);
1871 
1872   Register Dst = MI.getOperand(0).getReg();
1873   Register Src0 = MI.getOperand(1).getReg();
1874   LLT DstTy = MRI.getType(Dst);
1875   LLT SrcTy = MRI.getType(Src0);
1876 
1877   if (SrcTy == V2S16 && DstTy == V2S16 &&
1878       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1879     return true;
1880 
1881   MachineIRBuilder HelperBuilder(MI);
1882   GISelObserverWrapper DummyObserver;
1883   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1884   HelperBuilder.setInstr(MI);
1885   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1886 }
1887 
1888 bool AMDGPULegalizerInfo::legalizeSinCos(
1889   MachineInstr &MI, MachineRegisterInfo &MRI,
1890   MachineIRBuilder &B) const {
1891   B.setInstr(MI);
1892 
1893   Register DstReg = MI.getOperand(0).getReg();
1894   Register SrcReg = MI.getOperand(1).getReg();
1895   LLT Ty = MRI.getType(DstReg);
1896   unsigned Flags = MI.getFlags();
1897 
1898   Register TrigVal;
1899   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1900   if (ST.hasTrigReducedRange()) {
1901     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1902     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1903       .addUse(MulVal.getReg(0))
1904       .setMIFlags(Flags).getReg(0);
1905   } else
1906     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1907 
1908   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1909     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1910   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1911     .addUse(TrigVal)
1912     .setMIFlags(Flags);
1913   MI.eraseFromParent();
1914   return true;
1915 }
1916 
1917 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1918   Register DstReg, LLT PtrTy,
1919   MachineIRBuilder &B, const GlobalValue *GV,
1920   unsigned Offset, unsigned GAFlags) const {
1921   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1922   // to the following code sequence:
1923   //
1924   // For constant address space:
1925   //   s_getpc_b64 s[0:1]
1926   //   s_add_u32 s0, s0, $symbol
1927   //   s_addc_u32 s1, s1, 0
1928   //
1929   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1930   //   a fixup or relocation is emitted to replace $symbol with a literal
1931   //   constant, which is a pc-relative offset from the encoding of the $symbol
1932   //   operand to the global variable.
1933   //
1934   // For global address space:
1935   //   s_getpc_b64 s[0:1]
1936   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1937   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1938   //
1939   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1940   //   fixups or relocations are emitted to replace $symbol@*@lo and
1941   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1942   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1943   //   operand to the global variable.
1944   //
1945   // What we want here is an offset from the value returned by s_getpc
1946   // (which is the address of the s_add_u32 instruction) to the global
1947   // variable, but since the encoding of $symbol starts 4 bytes after the start
1948   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1949   // small. This requires us to add 4 to the global variable offset in order to
1950   // compute the correct address.
1951 
1952   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1953 
1954   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1955     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1956 
1957   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1958     .addDef(PCReg);
1959 
1960   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
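  // For GOT- and PC-relative accesses the second operand carries the matching
  // @hi relocation (GAFlags + 1); a plain fixup (MO_NONE) only needs a zero
  // high operand.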
1961   if (GAFlags == SIInstrInfo::MO_NONE)
1962     MIB.addImm(0);
1963   else
1964     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1965 
1966   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1967 
1968   if (PtrTy.getSizeInBits() == 32)
1969     B.buildExtract(DstReg, PCReg, 0);
1970   return true;
}
1972 
1973 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1974   MachineInstr &MI, MachineRegisterInfo &MRI,
1975   MachineIRBuilder &B) const {
1976   Register DstReg = MI.getOperand(0).getReg();
1977   LLT Ty = MRI.getType(DstReg);
1978   unsigned AS = Ty.getAddressSpace();
1979 
1980   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1981   MachineFunction &MF = B.getMF();
1982   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1983   B.setInstr(MI);
1984 
1985   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1986     if (!MFI->isEntryFunction()) {
1987       const Function &Fn = MF.getFunction();
1988       DiagnosticInfoUnsupported BadLDSDecl(
1989         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1990         DS_Warning);
1991       Fn.getContext().diagnose(BadLDSDecl);
1992 
1993       // We currently don't have a way to correctly allocate LDS objects that
1994       // aren't directly associated with a kernel. We do force inlining of
1995       // functions that use local objects. However, if these dead functions are
1996       // not eliminated, we don't want a compile time error. Just emit a warning
1997       // and a trap, since there should be no callable path here.
1998       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1999       B.buildUndef(DstReg);
2000       MI.eraseFromParent();
2001       return true;
2002     }
2003 
2004     // TODO: We could emit code to handle the initialization somewhere.
2005     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2006       const SITargetLowering *TLI = ST.getTargetLowering();
2007       if (!TLI->shouldUseLDSConstAddress(GV)) {
2008         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2009         return true; // Leave in place;
2010       }
2011 
2012       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2013       MI.eraseFromParent();
2014       return true;
2015     }
2016 
2017     const Function &Fn = MF.getFunction();
2018     DiagnosticInfoUnsupported BadInit(
2019       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2020     Fn.getContext().diagnose(BadInit);
2021     return true;
2022   }
2023 
2024   const SITargetLowering *TLI = ST.getTargetLowering();
2025 
2026   if (TLI->shouldEmitFixup(GV)) {
2027     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2028     MI.eraseFromParent();
2029     return true;
2030   }
2031 
2032   if (TLI->shouldEmitPCReloc(GV)) {
2033     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2034     MI.eraseFromParent();
2035     return true;
2036   }
2037 
2038   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2039   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2040 
2041   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2042       MachinePointerInfo::getGOT(MF),
2043       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2044           MachineMemOperand::MOInvariant,
2045       8 /*Size*/, Align(8));
2046 
2047   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2048 
2049   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2051     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2052     B.buildExtract(DstReg, Load, 0);
2053   } else
2054     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2055 
2056   MI.eraseFromParent();
2057   return true;
2058 }
2059 
2060 bool AMDGPULegalizerInfo::legalizeLoad(
2061   MachineInstr &MI, MachineRegisterInfo &MRI,
2062   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2063   B.setInstr(MI);
2064   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2065   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2066   Observer.changingInstr(MI);
2067   MI.getOperand(1).setReg(Cast.getReg(0));
2068   Observer.changedInstr(MI);
2069   return true;
2070 }
2071 
2072 bool AMDGPULegalizerInfo::legalizeFMad(
2073   MachineInstr &MI, MachineRegisterInfo &MRI,
2074   MachineIRBuilder &B) const {
2075   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2076   assert(Ty.isScalar());
2077 
2078   MachineFunction &MF = B.getMF();
2079   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2080 
2081   // TODO: Always legal with future ftz flag.
2082   // FIXME: Do we need just output?
2083   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2084     return true;
2085   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2086     return true;
2087 
2088   MachineIRBuilder HelperBuilder(MI);
2089   GISelObserverWrapper DummyObserver;
2090   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2091   HelperBuilder.setInstr(MI);
2092   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2093 }
2094 
2095 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2096   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2097   Register DstReg = MI.getOperand(0).getReg();
2098   Register PtrReg = MI.getOperand(1).getReg();
2099   Register CmpVal = MI.getOperand(2).getReg();
2100   Register NewVal = MI.getOperand(3).getReg();
2101 
2102   assert(SITargetLowering::isFlatGlobalAddrSpace(
2103            MRI.getType(PtrReg).getAddressSpace()) &&
2104          "this should not have been custom lowered");
2105 
2106   LLT ValTy = MRI.getType(CmpVal);
2107   LLT VecTy = LLT::vector(2, ValTy);
2108 
2109   B.setInstr(MI);
2110   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2111 
2112   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2113     .addDef(DstReg)
2114     .addUse(PtrReg)
2115     .addUse(PackedVal)
2116     .setMemRefs(MI.memoperands());
2117 
2118   MI.eraseFromParent();
2119   return true;
2120 }
2121 
2122 bool AMDGPULegalizerInfo::legalizeFlog(
2123   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2124   Register Dst = MI.getOperand(0).getReg();
2125   Register Src = MI.getOperand(1).getReg();
2126   LLT Ty = B.getMRI()->getType(Dst);
2127   unsigned Flags = MI.getFlags();
2128   B.setInstr(MI);
2129 
2130   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2131   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2132 
2133   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2134   MI.eraseFromParent();
2135   return true;
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2139                                        MachineIRBuilder &B) const {
2140   Register Dst = MI.getOperand(0).getReg();
2141   Register Src = MI.getOperand(1).getReg();
2142   unsigned Flags = MI.getFlags();
2143   LLT Ty = B.getMRI()->getType(Dst);
2144   B.setInstr(MI);
2145 
2146   auto K = B.buildFConstant(Ty, numbers::log2e);
2147   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2148   B.buildFExp2(Dst, Mul, Flags);
2149   MI.eraseFromParent();
2150   return true;
2151 }
2152 
2153 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2154                                        MachineIRBuilder &B) const {
2155   Register Dst = MI.getOperand(0).getReg();
2156   Register Src0 = MI.getOperand(1).getReg();
2157   Register Src1 = MI.getOperand(2).getReg();
2158   unsigned Flags = MI.getFlags();
2159   LLT Ty = B.getMRI()->getType(Dst);
2160   B.setInstr(MI);
2161   const LLT S16 = LLT::scalar(16);
2162   const LLT S32 = LLT::scalar(32);
2163 
2164   if (Ty == S32) {
2165     auto Log = B.buildFLog2(S32, Src0, Flags);
2166     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2167       .addUse(Log.getReg(0))
2168       .addUse(Src1)
2169       .setMIFlags(Flags);
2170     B.buildFExp2(Dst, Mul, Flags);
2171   } else if (Ty == S16) {
2172     // There's no f16 fmul_legacy, so we need to convert for it.
2173     auto Log = B.buildFLog2(S16, Src0, Flags);
2174     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2175     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2176     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2177       .addUse(Ext0.getReg(0))
2178       .addUse(Ext1.getReg(0))
2179       .setMIFlags(Flags);
2180 
2181     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2182   } else
2183     return false;
2184 
2185   MI.eraseFromParent();
2186   return true;
2187 }
2188 
2189 // Find a source register, ignoring any possible source modifiers.
2190 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2191   Register ModSrc = OrigSrc;
2192   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2193     ModSrc = SrcFNeg->getOperand(1).getReg();
2194     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2195       ModSrc = SrcFAbs->getOperand(1).getReg();
2196   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2197     ModSrc = SrcFAbs->getOperand(1).getReg();
2198   return ModSrc;
2199 }
2200 
2201 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2202                                          MachineRegisterInfo &MRI,
2203                                          MachineIRBuilder &B) const {
2204   B.setInstr(MI);
2205 
2206   const LLT S1 = LLT::scalar(1);
2207   const LLT S64 = LLT::scalar(64);
2208   Register Dst = MI.getOperand(0).getReg();
2209   Register OrigSrc = MI.getOperand(1).getReg();
2210   unsigned Flags = MI.getFlags();
2211   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2212          "this should not have been custom lowered");
2213 
2214   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2215   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2216   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2217   // V_FRACT bug is:
2218   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2219   //
2220   // Convert floor(x) to (x - fract(x))
2221 
2222   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2223     .addUse(OrigSrc)
2224     .setMIFlags(Flags);
2225 
2226   // Give source modifier matching some assistance before obscuring a foldable
2227   // pattern.
2228 
2229   // TODO: We can avoid the neg on the fract? The input sign to fract
2230   // shouldn't matter?
2231   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2232 
2233   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
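  // The constant is the largest double less than 1.0, implementing the min()
  // clamp in the workaround formula above.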
2234 
2235   Register Min = MRI.createGenericVirtualRegister(S64);
2236 
2237   // We don't need to concern ourselves with the snan handling difference, so
2238   // use the one which will directly select.
2239   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2240   if (MFI->getMode().IEEE)
2241     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2242   else
2243     B.buildFMinNum(Min, Fract, Const, Flags);
2244 
2245   Register CorrectedFract = Min;
2246   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2248     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2249   }
2250 
2251   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2252   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2253 
2254   MI.eraseFromParent();
2255   return true;
2256 }
2257 
2258 // Turn an illegal packed v2s16 build vector into bit operations.
2259 // TODO: This should probably be a bitcast action in LegalizerHelper.
2260 bool AMDGPULegalizerInfo::legalizeBuildVector(
2261   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2262   Register Dst = MI.getOperand(0).getReg();
2263   const LLT S32 = LLT::scalar(32);
2264   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2265 
2266   Register Src0 = MI.getOperand(1).getReg();
2267   Register Src1 = MI.getOperand(2).getReg();
2268   assert(MRI.getType(Src0) == LLT::scalar(16));
2269 
2270   B.setInstr(MI);
2271   auto Merge = B.buildMerge(S32, {Src0, Src1});
2272   B.buildBitcast(Dst, Merge);
2273 
2274   MI.eraseFromParent();
2275   return true;
2276 }
2277 
2278 // Return the use branch instruction, otherwise null if the usage is invalid.
2279 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2280                                        MachineRegisterInfo &MRI,
2281                                        MachineInstr *&Br) {
2282   Register CondDef = MI.getOperand(0).getReg();
2283   if (!MRI.hasOneNonDBGUse(CondDef))
2284     return nullptr;
2285 
2286   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2287   if (UseMI.getParent() != MI.getParent() ||
2288       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2289     return nullptr;
2290 
2291   // Make sure the cond br is followed by a G_BR
2292   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2293   if (Next != MI.getParent()->end()) {
2294     if (Next->getOpcode() != AMDGPU::G_BR)
2295       return nullptr;
2296     Br = &*Next;
2297   }
2298 
2299   return &UseMI;
2300 }
2301 
2302 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2303                                                MachineRegisterInfo &MRI,
2304                                                Register LiveIn,
2305                                                Register PhyReg) const {
2306   assert(PhyReg.isPhysical() && "Physical register expected");
2307 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2310   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2311   if (!MRI.getVRegDef(LiveIn)) {
2312     // FIXME: Should have scoped insert pt
2313     MachineBasicBlock &OrigInsBB = B.getMBB();
2314     auto OrigInsPt = B.getInsertPt();
2315 
2316     MachineBasicBlock &EntryMBB = B.getMF().front();
2317     EntryMBB.addLiveIn(PhyReg);
2318     B.setInsertPt(EntryMBB, EntryMBB.begin());
2319     B.buildCopy(LiveIn, PhyReg);
2320 
2321     B.setInsertPt(OrigInsBB, OrigInsPt);
2322   }
2323 
2324   return LiveIn;
2325 }
2326 
2327 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2328                                                 MachineRegisterInfo &MRI,
2329                                                 Register PhyReg, LLT Ty,
2330                                                 bool InsertLiveInCopy) const {
2331   assert(PhyReg.isPhysical() && "Physical register expected");
2332 
  // Get or create the virtual live-in register.
2334   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2335   if (!LiveIn) {
2336     LiveIn = MRI.createGenericVirtualRegister(Ty);
2337     MRI.addLiveIn(PhyReg, LiveIn);
2338   }
2339 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting a live-in copy from the
  // physical register to the virtual register is not required.
2343   if (!InsertLiveInCopy)
2344     return LiveIn;
2345 
2346   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2347 }
2348 
2349 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2350     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2351   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2352   const ArgDescriptor *Arg;
2353   const TargetRegisterClass *RC;
2354   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2355   if (!Arg) {
2356     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2357     return nullptr;
2358   }
2359   return Arg;
2360 }
2361 
2362 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2363                                          const ArgDescriptor *Arg) const {
2364   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2365     return false; // TODO: Handle these
2366 
2367   Register SrcReg = Arg->getRegister();
2368   assert(SrcReg.isPhysical() && "Physical register expected");
2369   assert(DstReg.isVirtual() && "Virtual register expected");
2370 
2371   MachineRegisterInfo &MRI = *B.getMRI();
2372 
2373   LLT Ty = MRI.getType(DstReg);
2374   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2375 
2376   if (Arg->isMasked()) {
2377     // TODO: Should we try to emit this once in the entry block?
2378     const LLT S32 = LLT::scalar(32);
2379     const unsigned Mask = Arg->getMask();
2380     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2381 
2382     Register AndMaskSrc = LiveIn;
2383 
2384     if (Shift != 0) {
2385       auto ShiftAmt = B.buildConstant(S32, Shift);
2386       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2387     }
2388 
2389     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2390   } else {
2391     B.buildCopy(DstReg, LiveIn);
2392   }
2393 
2394   return true;
2395 }
2396 
2397 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2398     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2399     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2400   B.setInstr(MI);
2401 
2402   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2403   if (!Arg)
2404     return false;
2405 
2406   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2407     return false;
2408 
2409   MI.eraseFromParent();
2410   return true;
2411 }
2412 
2413 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2414                                        MachineRegisterInfo &MRI,
2415                                        MachineIRBuilder &B) const {
2416   B.setInstr(MI);
2417   Register Dst = MI.getOperand(0).getReg();
2418   LLT DstTy = MRI.getType(Dst);
2419   LLT S16 = LLT::scalar(16);
2420   LLT S32 = LLT::scalar(32);
2421   LLT S64 = LLT::scalar(64);
2422 
2423   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2424     return true;
2425 
2426   if (DstTy == S16)
2427     return legalizeFDIV16(MI, MRI, B);
2428   if (DstTy == S32)
2429     return legalizeFDIV32(MI, MRI, B);
2430   if (DstTy == S64)
2431     return legalizeFDIV64(MI, MRI, B);
2432 
2433   return false;
2434 }
2435 
2436 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
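  // Approximate 2^32 / Src: convert to float, take the hardware reciprocal,
  // scale by 2^32 (0x4f800000 is 2^32 as a float), and convert back to an
  // unsigned integer.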
2437   const LLT S32 = LLT::scalar(32);
2438 
2439   auto Cvt0 = B.buildUITOFP(S32, Src);
2440   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2441   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2442   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2443   return B.buildFPTOUI(S32, Mul).getReg(0);
2444 }
2445 
2446 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2447                                                   Register DstReg,
2448                                                   Register Num,
2449                                                   Register Den,
2450                                                   bool IsRem) const {
2451   const LLT S1 = LLT::scalar(1);
2452   const LLT S32 = LLT::scalar(32);
2453 
2454   // RCP =  URECIP(Den) = 2^32 / Den + e
2455   // e is rounding error.
2456   auto RCP = buildDivRCP(B, Den);
2457 
2458   // RCP_LO = mul(RCP, Den)
2459   auto RCP_LO = B.buildMul(S32, RCP, Den);
2460 
  // RCP_HI = mulhu(RCP, Den)
2462   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2463 
2464   // NEG_RCP_LO = -RCP_LO
2465   auto Zero = B.buildConstant(S32, 0);
2466   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2467 
2468   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2469   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2470   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2471 
2472   // Calculate the rounding error from the URECIP instruction
2473   // E = mulhu(ABS_RCP_LO, RCP)
2474   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2475 
2476   // RCP_A_E = RCP + E
2477   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2478 
2479   // RCP_S_E = RCP - E
2480   auto RCP_S_E = B.buildSub(S32, RCP, E);
2481 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
2486   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2487 
2488   // Num_S_Remainder = Quotient * Den
2489   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2490 
2491   // Remainder = Num - Num_S_Remainder
2492   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2493 
2494   // Remainder_GE_Den = Remainder >= Den
2495   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2496 
2497   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2498   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2499                                        Num, Num_S_Remainder);
2500 
2501   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2502   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2503 
2504   // Calculate Division result:
2505 
2506   // Quotient_A_One = Quotient + 1
2507   auto One = B.buildConstant(S32, 1);
2508   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2509 
2510   // Quotient_S_One = Quotient - 1
2511   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2512 
  // Div = (Tmp1 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);
2515 
2516   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2517   if (IsRem) {
2518     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2519 
2520     // Calculate Rem result:
2521     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2522 
2523     // Remainder_A_Den = Remainder + Den
2524     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2525 
2526     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2527     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2528 
2529     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2530     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2531   } else {
2532     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2533   }
2534 }
2535 
2536 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2537                                               MachineRegisterInfo &MRI,
2538                                               MachineIRBuilder &B) const {
2539   B.setInstr(MI);
2540   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2541   Register DstReg = MI.getOperand(0).getReg();
2542   Register Num = MI.getOperand(1).getReg();
2543   Register Den = MI.getOperand(2).getReg();
2544   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2545   MI.eraseFromParent();
2546   return true;
2547 }
2548 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2550 //
2551 // Return lo, hi of result
2552 //
2553 // %cvt.lo = G_UITOFP Val.lo
2554 // %cvt.hi = G_UITOFP Val.hi
2555 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2556 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2557 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2558 // %mul2 = G_FMUL %mul1, 2**(-32)
2559 // %trunc = G_INTRINSIC_TRUNC %mul2
2560 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2561 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2562 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2563                                                        Register Val) {
2564   const LLT S32 = LLT::scalar(32);
2565   auto Unmerge = B.buildUnmerge(S32, Val);
2566 
2567   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2568   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2569 
2570   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2571                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2572 
2573   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2574   auto Mul1 =
2575       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2576 
2577   // 2**(-32)
2578   auto Mul2 =
2579       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2580   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2581 
2582   // -(2**32)
2583   auto Mad2 = B.buildFMAD(S32, Trunc,
2584                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2585 
2586   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2587   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2588 
2589   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2590 }
2591 
2592 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2593                                               MachineRegisterInfo &MRI,
2594                                               MachineIRBuilder &B) const {
2595   B.setInstr(MI);
2596 
2597   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2598   const LLT S32 = LLT::scalar(32);
2599   const LLT S64 = LLT::scalar(64);
2600   const LLT S1 = LLT::scalar(1);
2601   Register Numer = MI.getOperand(1).getReg();
2602   Register Denom = MI.getOperand(2).getReg();
2603   Register RcpLo, RcpHi;
2604 
2605   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
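  // Refine the initial ~2^64/Denom estimate with two rounds of
  // multiply-by-negated-denominator plus add-back of the error (Add1, Add2),
  // take the quotient as the high 64 bits of Numer * Add2, and then correct
  // the quotient and remainder with up to two conditional subtractions of the
  // denominator (the C3/C6 selects below).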
2606 
2607   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2608 
2609   auto Zero64 = B.buildConstant(S64, 0);
2610   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2611 
2612   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2613   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2614 
2615   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2616   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2617   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2618 
2619   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2620   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2621   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2622   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2623 
2624   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2625   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2626   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2627   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2628   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2629 
2630   auto Zero32 = B.buildConstant(S32, 0);
2631   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2632   auto Add2_HiC =
2633       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2634   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2635   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2636 
2637   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2638   Register NumerLo = UnmergeNumer.getReg(0);
2639   Register NumerHi = UnmergeNumer.getReg(1);
2640 
2641   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2642   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2643   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2644   Register Mul3_Lo = UnmergeMul3.getReg(0);
2645   Register Mul3_Hi = UnmergeMul3.getReg(1);
2646   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2647   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2648   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2649   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2650 
2651   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2652   Register DenomLo = UnmergeDenom.getReg(0);
2653   Register DenomHi = UnmergeDenom.getReg(1);
2654 
2655   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2656   auto C1 = B.buildSExt(S32, CmpHi);
2657 
2658   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2659   auto C2 = B.buildSExt(S32, CmpLo);
2660 
2661   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2662   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2663 
2664   // TODO: Here and below portions of the code can be enclosed into if/endif.
2665   // Currently control flow is unconditional and we have 4 selects after
2666   // potential endif to substitute PHIs.
2667 
2668   // if C3 != 0 ...
2669   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2670   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2671   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2672   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2673 
2674   auto One64 = B.buildConstant(S64, 1);
2675   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2676 
2677   auto C4 =
2678       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2679   auto C5 =
2680       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2681   auto C6 = B.buildSelect(
2682       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2683 
2684   // if (C6 != 0)
2685   auto Add4 = B.buildAdd(S64, Add3, One64);
2686   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2687 
2688   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2689   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2690   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2691 
2692   // endif C6
2693   // endif C3
2694 
2695   if (IsDiv) {
2696     auto Sel1 = B.buildSelect(
2697         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2698     B.buildSelect(MI.getOperand(0),
2699                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2700   } else {
2701     auto Sel2 = B.buildSelect(
2702         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2703     B.buildSelect(MI.getOperand(0),
2704                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2705   }
2706 
2707   MI.eraseFromParent();
2708   return true;
2709 }
2710 
2711 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2712                                             MachineRegisterInfo &MRI,
2713                                             MachineIRBuilder &B) const {
2714   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2715   if (Ty == LLT::scalar(32))
2716     return legalizeUDIV_UREM32(MI, MRI, B);
2717   if (Ty == LLT::scalar(64))
2718     return legalizeUDIV_UREM64(MI, MRI, B);
2719   return false;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2723                                               MachineRegisterInfo &MRI,
2724                                               MachineIRBuilder &B) const {
2725   B.setInstr(MI);
2726   const LLT S32 = LLT::scalar(32);
2727 
2728   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2729   Register DstReg = MI.getOperand(0).getReg();
2730   Register LHS = MI.getOperand(1).getReg();
2731   Register RHS = MI.getOperand(2).getReg();
2732 
2733   auto ThirtyOne = B.buildConstant(S32, 31);
2734   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
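  // abs(x) is computed as (x + (x >> 31)) ^ (x >> 31); the unsigned
  // division/remainder runs on the magnitudes and the signs are reapplied
  // below (the dividend's sign for the remainder, the xor of both signs for
  // the quotient).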
2736 
2737   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2738   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2739 
2740   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2741   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2742 
2743   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2744   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2745 
2746   if (IsRem) {
2747     auto RSign = LHSign; // Remainder sign is the same as LHS
2748     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2749     B.buildSub(DstReg, UDivRem, RSign);
2750   } else {
2751     auto DSign = B.buildXor(S32, LHSign, RHSign);
2752     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2753     B.buildSub(DstReg, UDivRem, DSign);
2754   }
2755 
2756   MI.eraseFromParent();
2757   return true;
2758 }
2759 
2760 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2761                                             MachineRegisterInfo &MRI,
2762                                             MachineIRBuilder &B) const {
2763   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2764     return legalizeSDIV_SREM32(MI, MRI, B);
2765   return false;
2766 }
2767 
2768 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2769                                                  MachineRegisterInfo &MRI,
2770                                                  MachineIRBuilder &B) const {
2771   Register Res = MI.getOperand(0).getReg();
2772   Register LHS = MI.getOperand(1).getReg();
2773   Register RHS = MI.getOperand(2).getReg();
2774 
2775   uint16_t Flags = MI.getFlags();
2776 
2777   LLT ResTy = MRI.getType(Res);
2778   LLT S32 = LLT::scalar(32);
2779   LLT S64 = LLT::scalar(64);
2780 
2781   const MachineFunction &MF = B.getMF();
2782   bool Unsafe =
2783     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2784 
2785   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2786     return false;
2787 
2788   if (!Unsafe && ResTy == S32 &&
2789       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2790     return false;
2791 
2792   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2793     // 1 / x -> RCP(x)
2794     if (CLHS->isExactlyValue(1.0)) {
2795       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2796         .addUse(RHS)
2797         .setMIFlags(Flags);
2798 
2799       MI.eraseFromParent();
2800       return true;
2801     }
2802 
2803     // -1 / x -> RCP( FNEG(x) )
2804     if (CLHS->isExactlyValue(-1.0)) {
2805       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2806       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2807         .addUse(FNeg.getReg(0))
2808         .setMIFlags(Flags);
2809 
2810       MI.eraseFromParent();
2811       return true;
2812     }
2813   }
2814 
2815   // x / y -> x * (1.0 / y)
2816   if (Unsafe) {
2817     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2818       .addUse(RHS)
2819       .setMIFlags(Flags);
2820     B.buildFMul(Res, LHS, RCP, Flags);
2821 
2822     MI.eraseFromParent();
2823     return true;
2824   }
2825 
2826   return false;
2827 }
2828 
2829 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2830                                          MachineRegisterInfo &MRI,
2831                                          MachineIRBuilder &B) const {
2832   B.setInstr(MI);
2833   Register Res = MI.getOperand(0).getReg();
2834   Register LHS = MI.getOperand(1).getReg();
2835   Register RHS = MI.getOperand(2).getReg();
2836 
2837   uint16_t Flags = MI.getFlags();
2838 
2839   LLT S16 = LLT::scalar(16);
2840   LLT S32 = LLT::scalar(32);
2841 
2842   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2843   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
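  // f16 division is expanded through f32: extend both operands, multiply the
  // numerator by the f32 reciprocal of the denominator, truncate the result
  // back to f16, and let div_fixup handle the special cases.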
2844 
2845   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2846     .addUse(RHSExt.getReg(0))
2847     .setMIFlags(Flags);
2848 
2849   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2850   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2851 
2852   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2853     .addUse(RDst.getReg(0))
2854     .addUse(RHS)
2855     .addUse(LHS)
2856     .setMIFlags(Flags);
2857 
2858   MI.eraseFromParent();
2859   return true;
2860 }
2861 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denormals; otherwise, restore the function's default FP32 denorm
// mode.
2864 static void toggleSPDenormMode(bool Enable,
2865                                MachineIRBuilder &B,
2866                                const GCNSubtarget &ST,
2867                                AMDGPU::SIModeRegisterDefaults Mode) {
2868   // Set SP denorm mode to this value.
2869   unsigned SPDenormMode =
2870     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2871 
2872   if (ST.hasDenormModeInst()) {
2873     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2874     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2875 
2876     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2877     B.buildInstr(AMDGPU::S_DENORM_MODE)
2878       .addImm(NewDenormModeValue);
2879 
2880   } else {
2881     // Select FP32 bit field in mode register.
2882     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2883                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2884                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2885 
2886     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2887       .addImm(SPDenormMode)
2888       .addImm(SPDenormModeBitField);
2889   }
2890 }
2891 
2892 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2893                                          MachineRegisterInfo &MRI,
2894                                          MachineIRBuilder &B) const {
2895   B.setInstr(MI);
2896   Register Res = MI.getOperand(0).getReg();
2897   Register LHS = MI.getOperand(1).getReg();
2898   Register RHS = MI.getOperand(2).getReg();
2899   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2900   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2901 
2902   uint16_t Flags = MI.getFlags();
2903 
2904   LLT S32 = LLT::scalar(32);
2905   LLT S1 = LLT::scalar(1);
2906 
2907   auto One = B.buildFConstant(S32, 1.0f);
2908 
2909   auto DenominatorScaled =
2910     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2911       .addUse(LHS)
2912       .addUse(RHS)
2913       .addImm(0)
2914       .setMIFlags(Flags);
2915   auto NumeratorScaled =
2916     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2917       .addUse(LHS)
2918       .addUse(RHS)
2919       .addImm(1)
2920       .setMIFlags(Flags);
2921 
2922   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2923     .addUse(DenominatorScaled.getReg(0))
2924     .setMIFlags(Flags);
2925   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2926 
2927   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2928   // aren't modeled as reading it.
2929   if (!Mode.allFP32Denormals())
2930     toggleSPDenormMode(true, B, ST, Mode);
2931 
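  // Fma0/Fma1 perform one Newton-Raphson refinement of the reciprocal of the
  // scaled denominator; Mul forms a scaled quotient estimate, Fma2/Fma3 refine
  // it, and Fma4 computes the remaining residual. div_fmas then applies the
  // final fma and the scaling decided by div_scale, and div_fixup handles sign
  // and the special cases.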
2932   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2933   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2934   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2935   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2936   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2937   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2938 
2939   if (!Mode.allFP32Denormals())
2940     toggleSPDenormMode(false, B, ST, Mode);
2941 
2942   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2943     .addUse(Fma4.getReg(0))
2944     .addUse(Fma1.getReg(0))
2945     .addUse(Fma3.getReg(0))
2946     .addUse(NumeratorScaled.getReg(1))
2947     .setMIFlags(Flags);
2948 
2949   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2950     .addUse(Fmas.getReg(0))
2951     .addUse(RHS)
2952     .addUse(LHS)
2953     .setMIFlags(Flags);
2954 
2955   MI.eraseFromParent();
2956   return true;
2957 }
2958 
2959 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2960                                          MachineRegisterInfo &MRI,
2961                                          MachineIRBuilder &B) const {
2962   B.setInstr(MI);
2963   Register Res = MI.getOperand(0).getReg();
2964   Register LHS = MI.getOperand(1).getReg();
2965   Register RHS = MI.getOperand(2).getReg();
2966 
2967   uint16_t Flags = MI.getFlags();
2968 
2969   LLT S64 = LLT::scalar(64);
2970   LLT S1 = LLT::scalar(1);
2971 
2972   auto One = B.buildFConstant(S64, 1.0);
2973 
2974   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2975     .addUse(LHS)
2976     .addUse(RHS)
2977     .addImm(0)
2978     .setMIFlags(Flags);
2979 
2980   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2981 
2982   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2983     .addUse(DivScale0.getReg(0))
2984     .setMIFlags(Flags);
2985 
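  // Fma0..Fma3 perform two Newton-Raphson refinements of the reciprocal of the
  // scaled denominator; Mul forms the scaled quotient estimate and Fma4 its
  // residual. div_fmas applies the final fma plus the scale bit from
  // div_scale, and div_fixup handles sign and the special cases.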
2986   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2987   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2988   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2989 
2990   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2991     .addUse(LHS)
2992     .addUse(RHS)
2993     .addImm(1)
2994     .setMIFlags(Flags);
2995 
2996   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2997   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2998   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2999 
3000   Register Scale;
3001   if (!ST.hasUsableDivScaleConditionOutput()) {
3002     // Work around a hardware bug on SI where the condition output from div_scale
3003     // is not usable.
3004 
3005     LLT S32 = LLT::scalar(32);
3006 
3007     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3008     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3009     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3010     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3011 
3012     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3013                               Scale1Unmerge.getReg(1));
3014     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3015                               Scale0Unmerge.getReg(1));
3016     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3017   } else {
3018     Scale = DivScale1.getReg(1);
3019   }
3020 
3021   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3022     .addUse(Fma4.getReg(0))
3023     .addUse(Fma3.getReg(0))
3024     .addUse(Mul.getReg(0))
3025     .addUse(Scale)
3026     .setMIFlags(Flags);
3027 
3028   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3029     .addUse(Fmas.getReg(0))
3030     .addUse(RHS)
3031     .addUse(LHS)
3032     .setMIFlags(Flags);
3033 
3034   MI.eraseFromParent();
3035   return true;
3036 }
3037 
3038 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3039                                                  MachineRegisterInfo &MRI,
3040                                                  MachineIRBuilder &B) const {
3041   B.setInstr(MI);
3042   Register Res = MI.getOperand(0).getReg();
3043   Register LHS = MI.getOperand(2).getReg();
3044   Register RHS = MI.getOperand(3).getReg();
3045   uint16_t Flags = MI.getFlags();
3046 
3047   LLT S32 = LLT::scalar(32);
3048   LLT S1 = LLT::scalar(1);
3049 
3050   auto Abs = B.buildFAbs(S32, RHS, Flags);
3051   const APFloat C0Val(1.0f);
3052 
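  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. If |RHS| is very large,
  // pre-scale the denominator by 2^-32 so that rcp does not underflow, then
  // multiply the result by the same scale factor to compensate.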
3053   auto C0 = B.buildConstant(S32, 0x6f800000);
3054   auto C1 = B.buildConstant(S32, 0x2f800000);
3055   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3056 
3057   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3058   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3059 
3060   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3061 
3062   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3063     .addUse(Mul0.getReg(0))
3064     .setMIFlags(Flags);
3065 
3066   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3067 
3068   B.buildFMul(Res, Sel, Mul1, Flags);
3069 
3070   MI.eraseFromParent();
3071   return true;
3072 }
3073 
3074 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3075                                                  MachineRegisterInfo &MRI,
3076                                                  MachineIRBuilder &B) const {
3077   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3078   if (!MFI->isEntryFunction()) {
3079     return legalizePreloadedArgIntrin(MI, MRI, B,
3080                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3081   }
3082 
3083   B.setInstr(MI);
3084 
3085   uint64_t Offset =
3086     ST.getTargetLowering()->getImplicitParameterOffset(
3087       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3088   Register DstReg = MI.getOperand(0).getReg();
3089   LLT DstTy = MRI.getType(DstReg);
3090   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3091 
3092   const ArgDescriptor *Arg;
3093   const TargetRegisterClass *RC;
3094   std::tie(Arg, RC)
3095     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3096   if (!Arg)
3097     return false;
3098 
3099   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3100   if (!loadInputValue(KernargPtrReg, B, Arg))
3101     return false;
3102 
3103   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3104   MI.eraseFromParent();
3105   return true;
3106 }
3107 
3108 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3109                                               MachineRegisterInfo &MRI,
3110                                               MachineIRBuilder &B,
3111                                               unsigned AddrSpace) const {
3112   B.setInstr(MI);
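  // A flat pointer lies in the given segment iff the high 32 bits of its
  // address match the segment's aperture base.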
3113   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3114   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3115   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3116   MI.eraseFromParent();
3117   return true;
3118 }
3119 
3120 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3121 // offset (the offset that is included in bounds checking and swizzling, to be
3122 // split between the instruction's voffset and immoffset fields) and soffset
3123 // (the offset that is excluded from bounds checking and swizzling, to go in
3124 // the instruction's soffset field).  This function takes the first kind of
3125 // offset and figures out how to split it between voffset and immoffset.
3126 std::tuple<Register, unsigned, unsigned>
3127 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3128                                         Register OrigOffset) const {
3129   const unsigned MaxImm = 4095;
3130   Register BaseReg;
3131   unsigned TotalConstOffset;
3132   MachineInstr *OffsetDef;
3133   const LLT S32 = LLT::scalar(32);
3134 
3135   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3136     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3137 
3138   unsigned ImmOffset = TotalConstOffset;
3139 
3140   // If the immediate value is too big for the immoffset field, keep only its
3141   // low 12 bits in the immoffset field so that the value that is copied/added
3142   // for the voffset field is a multiple of 4096, and it stands more chance
3143   // of being CSEd with the copy/add for another similar load/store.
3144   // However, do not do that rounding down to a multiple of 4096 if that is a
3145   // negative number, as it appears to be illegal to have a negative offset
3146   // in the vgpr, even if adding the immediate offset makes it positive.
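  // For example, a constant offset of 0x11008 is split into an immoffset of 8,
  // with the remaining 0x11000 folded into the base register.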
3147   unsigned Overflow = ImmOffset & ~MaxImm;
3148   ImmOffset -= Overflow;
3149   if ((int32_t)Overflow < 0) {
3150     Overflow += ImmOffset;
3151     ImmOffset = 0;
3152   }
3153 
3154   if (Overflow != 0) {
3155     if (!BaseReg) {
3156       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3157     } else {
3158       auto OverflowVal = B.buildConstant(S32, Overflow);
3159       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3160     }
3161   }
3162 
3163   if (!BaseReg)
3164     BaseReg = B.buildConstant(S32, 0).getReg(0);
3165 
3166   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3167 }
3168 
3169 /// Handle register layout difference for f16 images for some subtargets.
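/// On subtargets with unpacked D16 memory instructions, each 16-bit element
/// occupies the low half of a 32-bit register, so e.g. a <4 x s16> value is
/// rewritten here to an any-extended <4 x s32>.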
3170 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3171                                              MachineRegisterInfo &MRI,
3172                                              Register Reg) const {
3173   if (!ST.hasUnpackedD16VMem())
3174     return Reg;
3175 
3176   const LLT S16 = LLT::scalar(16);
3177   const LLT S32 = LLT::scalar(32);
3178   LLT StoreVT = MRI.getType(Reg);
3179   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3180 
3181   auto Unmerge = B.buildUnmerge(S16, Reg);
3182 
3183   SmallVector<Register, 4> WideRegs;
3184   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3185     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3186 
3187   int NumElts = StoreVT.getNumElements();
3188 
3189   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3190 }
3191 
3192 Register AMDGPULegalizerInfo::fixStoreSourceType(
3193   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3194   MachineRegisterInfo *MRI = B.getMRI();
3195   LLT Ty = MRI->getType(VData);
3196 
3197   const LLT S16 = LLT::scalar(16);
3198 
3199   // Fixup illegal register types for i8 stores.
3200   if (Ty == LLT::scalar(8) || Ty == S16) {
3201     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3202     return AnyExt;
3203   }
3204 
3205   if (Ty.isVector()) {
3206     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3207       if (IsFormat)
3208         return handleD16VData(B, *MRI, VData);
3209     }
3210   }
3211 
3212   return VData;
3213 }
3214 
3215 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3216                                               MachineRegisterInfo &MRI,
3217                                               MachineIRBuilder &B,
3218                                               bool IsTyped,
3219                                               bool IsFormat) const {
3220   B.setInstr(MI);
3221 
3222   Register VData = MI.getOperand(1).getReg();
3223   LLT Ty = MRI.getType(VData);
3224   LLT EltTy = Ty.getScalarType();
3225   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3226   const LLT S32 = LLT::scalar(32);
3227 
3228   VData = fixStoreSourceType(B, VData, IsFormat);
3229   Register RSrc = MI.getOperand(2).getReg();
3230 
3231   MachineMemOperand *MMO = *MI.memoperands_begin();
3232   const int MemSize = MMO->getSize();
3233 
3234   unsigned ImmOffset;
3235   unsigned TotalOffset;
3236 
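  // Operand layout after the intrinsic ID: vdata, rsrc, [vindex], voffset,
  // soffset, [format], aux, so the raw/struct and buffer/tbuffer variants can
  // be distinguished purely by operand count.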
3237   // The typed intrinsics add an immediate after the registers.
3238   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3239 
3240   // The struct intrinsic variants add one additional operand over raw.
3241   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3242   Register VIndex;
3243   int OpOffset = 0;
3244   if (HasVIndex) {
3245     VIndex = MI.getOperand(3).getReg();
3246     OpOffset = 1;
3247   }
3248 
3249   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3250   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3251 
3252   unsigned Format = 0;
3253   if (IsTyped) {
3254     Format = MI.getOperand(5 + OpOffset).getImm();
3255     ++OpOffset;
3256   }
3257 
3258   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3259 
3260   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3261   if (TotalOffset != 0)
3262     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3263 
3264   unsigned Opc;
3265   if (IsTyped) {
3266     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3267                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3268   } else if (IsFormat) {
3269     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3270                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3271   } else {
3272     switch (MemSize) {
3273     case 1:
3274       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3275       break;
3276     case 2:
3277       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3278       break;
3279     default:
3280       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3281       break;
3282     }
3283   }
3284 
3285   if (!VIndex)
3286     VIndex = B.buildConstant(S32, 0).getReg(0);
3287 
3288   auto MIB = B.buildInstr(Opc)
3289     .addUse(VData)              // vdata
3290     .addUse(RSrc)               // rsrc
3291     .addUse(VIndex)             // vindex
3292     .addUse(VOffset)            // voffset
3293     .addUse(SOffset)            // soffset
3294     .addImm(ImmOffset);         // offset(imm)
3295 
3296   if (IsTyped)
3297     MIB.addImm(Format);
3298 
3299   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3300      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3301      .addMemOperand(MMO);
3302 
3303   MI.eraseFromParent();
3304   return true;
3305 }
3306 
3307 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3308                                              MachineRegisterInfo &MRI,
3309                                              MachineIRBuilder &B,
3310                                              bool IsFormat,
3311                                              bool IsTyped) const {
3312   B.setInstr(MI);
3313 
3314   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3315   MachineMemOperand *MMO = *MI.memoperands_begin();
3316   const int MemSize = MMO->getSize();
3317   const LLT S32 = LLT::scalar(32);
3318 
3319   Register Dst = MI.getOperand(0).getReg();
3320   Register RSrc = MI.getOperand(2).getReg();
3321 
3322   // The typed intrinsics add an immediate after the registers.
3323   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3324 
3325   // The struct intrinsic variants add one additional operand over raw.
3326   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3327   Register VIndex;
3328   int OpOffset = 0;
3329   if (HasVIndex) {
3330     VIndex = MI.getOperand(3).getReg();
3331     OpOffset = 1;
3332   }
3333 
3334   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3335   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3336 
3337   unsigned Format = 0;
3338   if (IsTyped) {
3339     Format = MI.getOperand(5 + OpOffset).getImm();
3340     ++OpOffset;
3341   }
3342 
3343   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3344   unsigned ImmOffset;
3345   unsigned TotalOffset;
3346 
3347   LLT Ty = MRI.getType(Dst);
3348   LLT EltTy = Ty.getScalarType();
3349   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3350   const bool Unpacked = ST.hasUnpackedD16VMem();
3351 
3352   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3353   if (TotalOffset != 0)
3354     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3355 
3356   unsigned Opc;
3357 
3358   if (IsTyped) {
3359     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3360                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3361   } else if (IsFormat) {
3362     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3363                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3364   } else {
3365     switch (MemSize) {
3366     case 1:
3367       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3368       break;
3369     case 2:
3370       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3371       break;
3372     default:
3373       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3374       break;
3375     }
3376   }
3377 
3378   Register LoadDstReg;
3379 
3380   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3381   LLT UnpackedTy = Ty.changeElementSize(32);
3382 
3383   if (IsExtLoad)
3384     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3385   else if (Unpacked && IsD16 && Ty.isVector())
3386     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3387   else
3388     LoadDstReg = Dst;
3389 
3390   if (!VIndex)
3391     VIndex = B.buildConstant(S32, 0).getReg(0);
3392 
3393   auto MIB = B.buildInstr(Opc)
3394     .addDef(LoadDstReg)         // vdata
3395     .addUse(RSrc)               // rsrc
3396     .addUse(VIndex)             // vindex
3397     .addUse(VOffset)            // voffset
3398     .addUse(SOffset)            // soffset
3399     .addImm(ImmOffset);         // offset(imm)
3400 
3401   if (IsTyped)
3402     MIB.addImm(Format);
3403 
3404   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3405      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3406      .addMemOperand(MMO);
3407 
3408   if (LoadDstReg != Dst) {
3409     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3410 
3411     // The result was widened for the extending load; truncate it back to Dst.
3412     if (IsExtLoad)
3413       B.buildTrunc(Dst, LoadDstReg);
3414     else {
3415       // Repack to original 16-bit vector result
3416       // FIXME: G_TRUNC should work, but legalization currently fails
3417       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3418       SmallVector<Register, 4> Repack;
3419       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3420         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3421       B.buildMerge(Dst, Repack);
3422     }
3423   }
3424 
3425   MI.eraseFromParent();
3426   return true;
3427 }
3428 
3429 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3430                                                MachineIRBuilder &B,
3431                                                bool IsInc) const {
3432   B.setInstr(MI);
3433   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3434                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3435   B.buildInstr(Opc)
3436     .addDef(MI.getOperand(0).getReg())
3437     .addUse(MI.getOperand(2).getReg())
3438     .addUse(MI.getOperand(3).getReg())
3439     .cloneMemRefs(MI);
3440   MI.eraseFromParent();
3441   return true;
3442 }
3443 
3444 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3445   switch (IntrID) {
3446   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3447   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3448     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3449   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3450   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3451     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3452   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3453   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3454     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3455   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3456   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3457     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3458   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3459   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3460     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3461   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3462   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3463     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3464   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3465   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3466     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3467   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3468   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3469     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3470   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3471   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3472     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3473   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3474   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3475     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3476   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3477   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3478     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3479   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3480   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3481     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3482   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3483   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3484     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3485   default:
3486     llvm_unreachable("unhandled atomic opcode");
3487   }
3488 }
3489 
3490 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3491                                                MachineIRBuilder &B,
3492                                                Intrinsic::ID IID) const {
3493   B.setInstr(MI);
3494 
3495   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3496                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3497 
3498   Register Dst = MI.getOperand(0).getReg();
3499   Register VData = MI.getOperand(2).getReg();
3500 
3501   Register CmpVal;
3502   int OpOffset = 0;
3503 
3504   if (IsCmpSwap) {
3505     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3506     ++OpOffset;
3507   }
3508 
3509   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3510   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3511 
3512   // The struct intrinsic variants add one additional operand over raw.
3513   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3514   Register VIndex;
3515   if (HasVIndex) {
3516     VIndex = MI.getOperand(4 + OpOffset).getReg();
3517     ++OpOffset;
3518   }
3519 
3520   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3521   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3522   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3523 
3524   MachineMemOperand *MMO = *MI.memoperands_begin();
3525 
3526   unsigned ImmOffset;
3527   unsigned TotalOffset;
3528   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3529   if (TotalOffset != 0)
3530     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3531 
3532   if (!VIndex)
3533     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3534 
3535   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3536     .addDef(Dst)
3537     .addUse(VData); // vdata
3538 
3539   if (IsCmpSwap)
3540     MIB.addReg(CmpVal);
3541 
3542   MIB.addUse(RSrc)               // rsrc
3543      .addUse(VIndex)             // vindex
3544      .addUse(VOffset)            // voffset
3545      .addUse(SOffset)            // soffset
3546      .addImm(ImmOffset)          // offset(imm)
3547      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3548      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3549      .addMemOperand(MMO);
3550 
3551   MI.eraseFromParent();
3552   return true;
3553 }
3554 
3555 /// Pack the s16 typed address registers of \p MI into dword sized <2 x s16>
3556 /// vectors, appending the results to \p PackedAddrs.
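/// For example, a 2D sample with A16 coordinates packs (u, v) into a single
/// <2 x s16> register; a trailing odd coordinate or the last gradient of an
/// odd-sized gradient group is padded with undef.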
3557 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3558                                         SmallVectorImpl<Register> &PackedAddrs,
3559                                         int AddrIdx, int DimIdx, int NumVAddrs,
3560                                         int NumGradients) {
3561   const LLT S16 = LLT::scalar(16);
3562   const LLT V2S16 = LLT::vector(2, 16);
3563 
3564   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3565     MachineOperand &SrcOp = MI.getOperand(I);
3566     if (!SrcOp.isReg())
3567       continue; // _L to _LZ may have eliminated this.
3568 
3569     Register AddrReg = SrcOp.getReg();
3570 
3571     if (I < DimIdx) {
3572       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3573       PackedAddrs.push_back(AddrReg);
3574     } else {
3575       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3576       // derivatives dx/dh and dx/dv are packed with undef.
3577       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3578           ((NumGradients / 2) % 2 == 1 &&
3579            (I == DimIdx + (NumGradients / 2) - 1 ||
3580             I == DimIdx + NumGradients - 1)) ||
3581           // Check for _L to _LZ optimization
3582           !MI.getOperand(I + 1).isReg()) {
3583         PackedAddrs.push_back(
3584             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3585                 .getReg(0));
3586       } else {
3587         PackedAddrs.push_back(
3588             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3589                 .getReg(0));
3590         ++I;
3591       }
3592     }
3593   }
3594 }
3595 
3596 /// Convert from separate vaddr components to a single vector address register,
3597 /// and replace the remaining operands with $noreg.
3598 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3599                                      int DimIdx, int NumVAddrs) {
3600   const LLT S32 = LLT::scalar(32);
3601 
3602   SmallVector<Register, 8> AddrRegs;
3603   for (int I = 0; I != NumVAddrs; ++I) {
3604     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3605     if (SrcOp.isReg()) {
3606       AddrRegs.push_back(SrcOp.getReg());
3607       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3608     }
3609   }
3610 
3611   int NumAddrRegs = AddrRegs.size();
3612   if (NumAddrRegs != 1) {
3613     // Round up to 8 elements for v5-v7
3614     // FIXME: Missing intermediate sized register classes and instructions.
3615     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3616       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3617       auto Undef = B.buildUndef(S32);
3618       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3619       NumAddrRegs = RoundedNumRegs;
3620     }
3621 
3622     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3623     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3624   }
3625 
3626   for (int I = 1; I != NumVAddrs; ++I) {
3627     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3628     if (SrcOp.isReg())
3629       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3630   }
3631 }
3632 
3633 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3634 ///
3635 /// Depending on the subtarget, load/store with 16-bit element data need to be
3636 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3637 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3638 /// registers.
3639 ///
3640 /// We don't want to directly select image instructions just yet, but also want
3641 /// to expose all register repacking to the legalizer/combiners. We also don't
3642 /// want a selected instruction entering RegBankSelect. In order to avoid
3643 /// defining a multitude of intermediate image instructions, directly hack on
3644 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
3645 /// now unnecessary arguments with $noreg.
3646 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3647     MachineInstr &MI, MachineIRBuilder &B,
3648     GISelChangeObserver &Observer,
3649     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3650   B.setInstr(MI);
3651 
3652   const int NumDefs = MI.getNumExplicitDefs();
3653   bool IsTFE = NumDefs == 2;
3654   // We are only processing the operands of d16 image operations on subtargets
3655   // that use the unpacked register layout, or need to repack the TFE result.
3656 
3657   // TODO: Do we need to guard against already legalized intrinsics?
3658   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3659     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3660 
3661   MachineRegisterInfo *MRI = B.getMRI();
3662   const LLT S32 = LLT::scalar(32);
3663   const LLT S16 = LLT::scalar(16);
3664   const LLT V2S16 = LLT::vector(2, 16);
3665 
3666   // Index of first address argument
3667   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3668 
3669   // Check for 16-bit addresses and pack them if so.
3670   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3671   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3672   const bool IsA16 = AddrTy == S16;
3673 
3674   int NumVAddrs, NumGradients;
3675   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3676   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3677     getDMaskIdx(BaseOpcode, NumDefs);
3678   unsigned DMask = 0;
3679 
3680   int DMaskLanes = 0;
3681   if (!BaseOpcode->Atomic) {
3682     DMask = MI.getOperand(DMaskIdx).getImm();
3683     if (BaseOpcode->Gather4) {
3684       DMaskLanes = 4;
3685     } else if (DMask != 0) {
3686       DMaskLanes = countPopulation(DMask);
3687     } else if (!IsTFE && !BaseOpcode->Store) {
3688       // If dmask is 0, this is a no-op load. This can be eliminated.
3689       B.buildUndef(MI.getOperand(0));
3690       MI.eraseFromParent();
3691       return true;
3692     }
3693   }
3694 
3695   Observer.changingInstr(MI);
3696   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3697 
3698   unsigned NewOpcode = NumDefs == 0 ?
3699     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3700 
3701   // Track that we legalized this
3702   MI.setDesc(B.getTII().get(NewOpcode));
3703 
3704   // With TFE enabled and a dmask of 0, only the error flag would be returned.
3705   // Force dmask to be at least 1, otherwise the instruction will fail.
3706   if (IsTFE && DMask == 0) {
3707     DMask = 0x1;
3708     DMaskLanes = 1;
3709     MI.getOperand(DMaskIdx).setImm(DMask);
3710   }
3711 
3712   if (BaseOpcode->Atomic) {
3713     Register VData0 = MI.getOperand(2).getReg();
3714     LLT Ty = MRI->getType(VData0);
3715 
3716     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3717     if (Ty.isVector())
3718       return false;
3719 
3720     if (BaseOpcode->AtomicX2) {
3721       Register VData1 = MI.getOperand(3).getReg();
3722       // The two values are packed in one register.
3723       LLT PackedTy = LLT::vector(2, Ty);
3724       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3725       MI.getOperand(2).setReg(Concat.getReg(0));
3726       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3727     }
3728   }
3729 
3730   int CorrectedNumVAddrs = NumVAddrs;
3731 
3732   // Optimize _L to _LZ when the LOD argument is zero.
3733   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3734         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3735     const ConstantFP *ConstantLod;
3736     const int LodIdx = AddrIdx + NumVAddrs - 1;
3737 
3738     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3739       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3740         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3741         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3742           LZMappingInfo->LZ, ImageDimIntr->Dim);
3743 
3744         // The starting indexes should remain in the same place.
3745         --NumVAddrs;
3746         --CorrectedNumVAddrs;
3747 
3748         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3749           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3750         MI.RemoveOperand(LodIdx);
3751       }
3752     }
3753   }
3754 
3755   // Optimize _mip away when 'lod' is zero.
3756   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3757     int64_t ConstantLod;
3758     const int LodIdx = AddrIdx + NumVAddrs - 1;
3759 
3760     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3761       if (ConstantLod == 0) {
3762         // TODO: Change the intrinsic opcode and remove the operand instead of
3763         // replacing it with 0, as is done for the _L to _LZ handling above.
3764         MI.getOperand(LodIdx).ChangeToImmediate(0);
3765         --CorrectedNumVAddrs;
3766       }
3767     }
3768   }
3769 
3770   // If the register allocator cannot place the address registers contiguously
3771   // without introducing moves, then using the non-sequential address encoding
3772   // is always preferable, since it saves VALU instructions and is usually a
3773   // wash in terms of code size or even better.
3774   //
3775   // However, we currently have no way of hinting to the register allocator
3776   // that MIMG addresses should be placed contiguously when it is possible to
3777   // do so, so force non-NSA for the common 2-address case as a heuristic.
3778   //
3779   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3780   // allocation when possible.
3781   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3782 
3783   // Rewrite the addressing register layout before doing anything else.
3784   if (IsA16) {
3785     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3786     // should be introduced.
3787     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3788       return false;
3789 
3790     if (NumVAddrs > 1) {
3791       SmallVector<Register, 4> PackedRegs;
3792       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3793                                   NumGradients);
3794 
3795       if (!UseNSA && PackedRegs.size() > 1) {
3796         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3797         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3798         PackedRegs[0] = Concat.getReg(0);
3799         PackedRegs.resize(1);
3800       }
3801 
3802       const int NumPacked = PackedRegs.size();
3803       for (int I = 0; I != NumVAddrs; ++I) {
3804         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3805         if (!SrcOp.isReg()) {
3806           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3807           continue;
3808         }
3809 
3810         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3811 
3812         if (I < NumPacked)
3813           SrcOp.setReg(PackedRegs[I]);
3814         else
3815           SrcOp.setReg(AMDGPU::NoRegister);
3816       }
3817     }
3818   } else if (!UseNSA && NumVAddrs > 1) {
3819     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3820   }
3821 
3822 
3823   if (BaseOpcode->Store) { // No TFE for stores?
3824     // TODO: Handle dmask trim
3825     Register VData = MI.getOperand(1).getReg();
3826     LLT Ty = MRI->getType(VData);
3827     if (!Ty.isVector() || Ty.getElementType() != S16)
3828       return true;
3829 
3830     B.setInstr(MI);
3831 
3832     Register RepackedReg = handleD16VData(B, *MRI, VData);
3833     if (RepackedReg != VData) {
3834       MI.getOperand(1).setReg(RepackedReg);
3835     }
3836 
3837     return true;
3838   }
3839 
3840   Register DstReg = MI.getOperand(0).getReg();
3841   LLT Ty = MRI->getType(DstReg);
3842   const LLT EltTy = Ty.getScalarType();
3843   const bool IsD16 = Ty.getScalarType() == S16;
3844   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3845 
3846   // Confirm that the return type is large enough for the dmask specified
3847   if (NumElts < DMaskLanes)
3848     return false;
3849 
3850   if (NumElts > 4 || DMaskLanes > 4)
3851     return false;
3852 
3853   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3854   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3855 
3856   // The raw dword aligned data component of the load. The only legal cases
3857   // where this matters should be when using the packed D16 format, for
3858   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3859   LLT RoundedTy;
3860 
3861   // S32 vector to cover all data, plus the TFE result element.
3862   LLT TFETy;
3863 
3864   // Register type to use for each loaded component. Will be S32 or V2S16.
3865   LLT RegTy;
3866 
3867   if (IsD16 && ST.hasUnpackedD16VMem()) {
3868     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3869     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3870     RegTy = S32;
3871   } else {
3872     unsigned EltSize = EltTy.getSizeInBits();
3873     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3874     unsigned RoundedSize = 32 * RoundedElts;
3875     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3876     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3877     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3878   }
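  // For example, a packed <3 x s16> result rounds up to RoundedTy = <4 x s16>
  // with TFETy = <3 x s32>, while the same result on an unpacked subtarget
  // uses <3 x s32> with TFETy = <4 x s32>.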
3879 
3880   // The return type does not need adjustment.
3881   // TODO: Should we change s16 case to s32 or <2 x s16>?
3882   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3883     return true;
3884 
3885   Register Dst1Reg;
3886 
3887   // Insert after the instruction.
3888   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3889 
3890   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3891   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3892   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3893   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3894 
3895   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3896 
3897   MI.getOperand(0).setReg(NewResultReg);
3898 
3899   // In the IR, TFE is supposed to be used with a 2 element struct return
3900   // type. The instruction really returns these two values in one contiguous
3901   // register, with one additional dword beyond the loaded data. Rewrite the
3902   // return type to use a single register result.
3903 
3904   if (IsTFE) {
3905     Dst1Reg = MI.getOperand(1).getReg();
3906     if (MRI->getType(Dst1Reg) != S32)
3907       return false;
3908 
3909     // TODO: Make sure the TFE operand bit is set.
3910     MI.RemoveOperand(1);
3911 
3912     // Handle the easy case that requires no repack instructions.
3913     if (Ty == S32) {
3914       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3915       return true;
3916     }
3917   }
3918 
3919   // Now figure out how to copy the new result register back into the old
3920   // result.
3921   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3922 
3923   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3924 
3925   if (ResultNumRegs == 1) {
3926     assert(!IsTFE);
3927     ResultRegs[0] = NewResultReg;
3928   } else {
3929     // We have to repack into a new vector of some kind.
3930     for (int I = 0; I != NumDataRegs; ++I)
3931       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3932     B.buildUnmerge(ResultRegs, NewResultReg);
3933 
3934     // Drop the final TFE element to get the data part. The TFE result is
3935     // directly written to the right place already.
3936     if (IsTFE)
3937       ResultRegs.resize(NumDataRegs);
3938   }
3939 
3940   // For an s16 scalar result, we form an s32 result with a truncate regardless
3941   // of packed vs. unpacked.
3942   if (IsD16 && !Ty.isVector()) {
3943     B.buildTrunc(DstReg, ResultRegs[0]);
3944     return true;
3945   }
3946 
3947   // Avoid a build/concat_vector of 1 entry.
3948   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3949     B.buildBitcast(DstReg, ResultRegs[0]);
3950     return true;
3951   }
3952 
3953   assert(Ty.isVector());
3954 
3955   if (IsD16) {
3956     // For packed D16 results with TFE enabled, all the data components are
3957     // S32. Cast back to the expected type.
3958     //
3959     // TODO: We don't really need to use s32 elements for the load. We would only need one
3960     // cast for the TFE result if a multiple of v2s16 was used.
3961     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3962       for (Register &Reg : ResultRegs)
3963         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3964     } else if (ST.hasUnpackedD16VMem()) {
3965       for (Register &Reg : ResultRegs)
3966         Reg = B.buildTrunc(S16, Reg).getReg(0);
3967     }
3968   }
3969 
3970   auto padWithUndef = [&](LLT Ty, int NumElts) {
3971     if (NumElts == 0)
3972       return;
3973     Register Undef = B.buildUndef(Ty).getReg(0);
3974     for (int I = 0; I != NumElts; ++I)
3975       ResultRegs.push_back(Undef);
3976   };
3977 
3978   // Pad out any elements eliminated due to the dmask.
3979   LLT ResTy = MRI->getType(ResultRegs[0]);
3980   if (!ResTy.isVector()) {
3981     padWithUndef(ResTy, NumElts - ResultRegs.size());
3982     B.buildBuildVector(DstReg, ResultRegs);
3983     return true;
3984   }
3985 
3986   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3987   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3988 
3989   // Deal with the one annoying legal case.
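  // <3 x s16> covers two dwords, so pad the v2s16 pieces out to <6 x s16> and
  // unmerge into the v3s16 result plus a dead v3s16.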
3990   const LLT V3S16 = LLT::vector(3, 16);
3991   if (Ty == V3S16) {
3992     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
3993     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
3994     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
3995     return true;
3996   }
3997 
3998   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
3999   B.buildConcatVectors(DstReg, ResultRegs);
4000   return true;
4001 }
4002 
4003 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4004   MachineInstr &MI, MachineIRBuilder &B,
4005   GISelChangeObserver &Observer) const {
4006   Register Dst = MI.getOperand(0).getReg();
4007   LLT Ty = B.getMRI()->getType(Dst);
4008   unsigned Size = Ty.getSizeInBits();
4009   MachineFunction &MF = B.getMF();
4010 
4011   Observer.changingInstr(MI);
4012 
4013   // FIXME: We don't really need this intermediate instruction. The intrinsic
4014   // should be fixed to have a memory operand. Since it's readnone, we're not
4015   // allowed to add one.
4016   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4017   MI.RemoveOperand(1); // Remove intrinsic ID
4018 
4019   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4020   // TODO: Should this use datalayout alignment?
4021   const unsigned MemSize = (Size + 7) / 8;
4022   const Align MemAlign(4);
4023   MachineMemOperand *MMO = MF.getMachineMemOperand(
4024       MachinePointerInfo(),
4025       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4026           MachineMemOperand::MOInvariant,
4027       MemSize, MemAlign);
4028   MI.addMemOperand(MF, MMO);
4029 
4030   // There are no 96-bit result scalar loads, but widening to 128-bit should
4031   // always be legal. We may need to restore this to a 96-bit result if it turns
4032   // out this needs to be converted to a vector load during RegBankSelect.
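  // For example, an s96 result is widened to s128, and a <3 x s32> result to
  // <4 x s32>.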
4033   if (!isPowerOf2_32(Size)) {
4034     LegalizerHelper Helper(MF, *this, Observer, B);
4035     B.setInstr(MI);
4036 
4037     if (Ty.isVector())
4038       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4039     else
4040       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4041   }
4042 
4043   Observer.changedInstr(MI);
4044   return true;
4045 }
4046 
4047 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4048                                                 MachineRegisterInfo &MRI,
4049                                                 MachineIRBuilder &B) const {
4050   B.setInstr(MI);
4051 
4052   // On a non-HSA path, or if the trap handler is disabled, insert an s_endpgm.
4053   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4054       !ST.isTrapHandlerEnabled()) {
4055     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4056   } else {
4057     // Pass queue pointer to trap handler as input, and insert trap instruction
4058     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4059     const ArgDescriptor *Arg =
4060         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4061     if (!Arg)
4062       return false;
4063     MachineRegisterInfo &MRI = *B.getMRI();
4064     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4065     Register LiveIn = getLiveInRegister(
4066         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4067         /*InsertLiveInCopy=*/false);
4068     if (!loadInputValue(LiveIn, B, Arg))
4069       return false;
4070     B.buildCopy(SGPR01, LiveIn);
4071     B.buildInstr(AMDGPU::S_TRAP)
4072         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4073         .addReg(SGPR01, RegState::Implicit);
4074   }
4075 
4076   MI.eraseFromParent();
4077   return true;
4078 }
4079 
4080 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4081     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4082   B.setInstr(MI);
4083 
4084   // On a non-HSA path, or if the trap handler is disabled, report a warning
4085   // accordingly.
4086   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4087       !ST.isTrapHandlerEnabled()) {
4088     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4089                                      "debugtrap handler not supported",
4090                                      MI.getDebugLoc(), DS_Warning);
4091     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4092     Ctx.diagnose(NoTrap);
4093   } else {
4094     // Insert debug-trap instruction
4095     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4096   }
4097 
4098   MI.eraseFromParent();
4099   return true;
4100 }
4101 
4102 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4103                                             MachineIRBuilder &B,
4104                                             GISelChangeObserver &Observer) const {
4105   MachineRegisterInfo &MRI = *B.getMRI();
4106 
4107   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4108   auto IntrID = MI.getIntrinsicID();
4109   switch (IntrID) {
4110   case Intrinsic::amdgcn_if:
4111   case Intrinsic::amdgcn_else: {
4112     MachineInstr *Br = nullptr;
4113     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
4114       const SIRegisterInfo *TRI
4115         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4116 
4117       B.setInstr(*BrCond);
4118       Register Def = MI.getOperand(1).getReg();
4119       Register Use = MI.getOperand(3).getReg();
4120 
4121       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
4122       if (Br)
4123         BrTarget = Br->getOperand(0).getMBB();
4124 
4125       if (IntrID == Intrinsic::amdgcn_if) {
4126         B.buildInstr(AMDGPU::SI_IF)
4127           .addDef(Def)
4128           .addUse(Use)
4129           .addMBB(BrTarget);
4130       } else {
4131         B.buildInstr(AMDGPU::SI_ELSE)
4132           .addDef(Def)
4133           .addUse(Use)
4134           .addMBB(BrTarget)
4135           .addImm(0);
4136       }
4137 
4138       if (Br)
4139         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
4140 
4141       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4142       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4143       MI.eraseFromParent();
4144       BrCond->eraseFromParent();
4145       return true;
4146     }
4147 
4148     return false;
4149   }
4150   case Intrinsic::amdgcn_loop: {
4151     MachineInstr *Br = nullptr;
4152     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
4153       const SIRegisterInfo *TRI
4154         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4155 
4156       B.setInstr(*BrCond);
4157 
4158       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
4159       if (Br)
4160         BrTarget = Br->getOperand(0).getMBB();
4161 
4162       Register Reg = MI.getOperand(2).getReg();
4163       B.buildInstr(AMDGPU::SI_LOOP)
4164         .addUse(Reg)
4165         .addMBB(BrTarget);
4166 
4167       if (Br)
4168         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
4169 
4170       MI.eraseFromParent();
4171       BrCond->eraseFromParent();
4172       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4173       return true;
4174     }
4175 
4176     return false;
4177   }
4178   case Intrinsic::amdgcn_kernarg_segment_ptr:
4179     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4180       B.setInstr(MI);
4181       // This only makes sense to call in a kernel, so just lower to null.
4182       B.buildConstant(MI.getOperand(0).getReg(), 0);
4183       MI.eraseFromParent();
4184       return true;
4185     }
4186 
4187     return legalizePreloadedArgIntrin(
4188       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4189   case Intrinsic::amdgcn_implicitarg_ptr:
4190     return legalizeImplicitArgPtr(MI, MRI, B);
4191   case Intrinsic::amdgcn_workitem_id_x:
4192     return legalizePreloadedArgIntrin(MI, MRI, B,
4193                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4194   case Intrinsic::amdgcn_workitem_id_y:
4195     return legalizePreloadedArgIntrin(MI, MRI, B,
4196                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4197   case Intrinsic::amdgcn_workitem_id_z:
4198     return legalizePreloadedArgIntrin(MI, MRI, B,
4199                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4200   case Intrinsic::amdgcn_workgroup_id_x:
4201     return legalizePreloadedArgIntrin(MI, MRI, B,
4202                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4203   case Intrinsic::amdgcn_workgroup_id_y:
4204     return legalizePreloadedArgIntrin(MI, MRI, B,
4205                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4206   case Intrinsic::amdgcn_workgroup_id_z:
4207     return legalizePreloadedArgIntrin(MI, MRI, B,
4208                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4209   case Intrinsic::amdgcn_dispatch_ptr:
4210     return legalizePreloadedArgIntrin(MI, MRI, B,
4211                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4212   case Intrinsic::amdgcn_queue_ptr:
4213     return legalizePreloadedArgIntrin(MI, MRI, B,
4214                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4215   case Intrinsic::amdgcn_implicit_buffer_ptr:
4216     return legalizePreloadedArgIntrin(
4217       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4218   case Intrinsic::amdgcn_dispatch_id:
4219     return legalizePreloadedArgIntrin(MI, MRI, B,
4220                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4221   case Intrinsic::amdgcn_fdiv_fast:
4222     return legalizeFDIVFastIntrin(MI, MRI, B);
4223   case Intrinsic::amdgcn_is_shared:
4224     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4225   case Intrinsic::amdgcn_is_private:
4226     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4227   case Intrinsic::amdgcn_wavefrontsize: {
4228     B.setInstr(MI);
4229     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4230     MI.eraseFromParent();
4231     return true;
4232   }
4233   case Intrinsic::amdgcn_s_buffer_load:
4234     return legalizeSBufferLoad(MI, B, Observer);
4235   case Intrinsic::amdgcn_raw_buffer_store:
4236   case Intrinsic::amdgcn_struct_buffer_store:
4237     return legalizeBufferStore(MI, MRI, B, false, false);
4238   case Intrinsic::amdgcn_raw_buffer_store_format:
4239   case Intrinsic::amdgcn_struct_buffer_store_format:
4240     return legalizeBufferStore(MI, MRI, B, false, true);
4241   case Intrinsic::amdgcn_raw_tbuffer_store:
4242   case Intrinsic::amdgcn_struct_tbuffer_store:
4243     return legalizeBufferStore(MI, MRI, B, true, true);
4244   case Intrinsic::amdgcn_raw_buffer_load:
4245   case Intrinsic::amdgcn_struct_buffer_load:
4246     return legalizeBufferLoad(MI, MRI, B, false, false);
4247   case Intrinsic::amdgcn_raw_buffer_load_format:
4248   case Intrinsic::amdgcn_struct_buffer_load_format:
4249     return legalizeBufferLoad(MI, MRI, B, true, false);
4250   case Intrinsic::amdgcn_raw_tbuffer_load:
4251   case Intrinsic::amdgcn_struct_tbuffer_load:
4252     return legalizeBufferLoad(MI, MRI, B, true, true);
4253   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4254   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4255   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4256   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4257   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4258   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4259   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4260   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4261   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4262   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4263   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4264   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4265   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4266   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4267   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4268   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4269   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4270   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4271   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4272   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4273   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4274   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4275   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4276   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4277   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4278   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4279     return legalizeBufferAtomic(MI, B, IntrID);
4280   case Intrinsic::amdgcn_atomic_inc:
4281     return legalizeAtomicIncDec(MI, B, true);
4282   case Intrinsic::amdgcn_atomic_dec:
4283     return legalizeAtomicIncDec(MI, B, false);
4284   case Intrinsic::trap:
4285     return legalizeTrapIntrinsic(MI, MRI, B);
4286   case Intrinsic::debugtrap:
4287     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4288   default: {
4289     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4290             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4291       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4292     return true;
4293   }
4294   }
4295 
4296   return true;
4297 }
4298