1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
// Round the number of elements up to the next power of two.
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
// Round the scalar size in bits up to the next power of two.
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
70   return [=](const LegalityQuery &Query) {
71     return Query.Types[TypeIdx].getSizeInBits() == Size;
72   };
73 }
74 
75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     return Ty.isVector() &&
79            Ty.getNumElements() % 2 != 0 &&
80            Ty.getElementType().getSizeInBits() < 32 &&
81            Ty.getSizeInBits() % 32 != 0;
82   };
83 }
84 
85 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
86   return [=](const LegalityQuery &Query) {
87     const LLT Ty = Query.Types[TypeIdx];
88     const LLT EltTy = Ty.getScalarType();
89     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
90   };
91 }
92 
93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getElementType();
97     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
98   };
99 }
100 
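// Reduce the number of elements so the resulting type fits in (roughly) 64
// bits, keeping the element type.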
101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     unsigned Size = Ty.getSizeInBits();
106     unsigned Pieces = (Size + 63) / 64;
107     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
108     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
109   };
110 }
111 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
115   return [=](const LegalityQuery &Query) {
116     const LLT Ty = Query.Types[TypeIdx];
117 
118     const LLT EltTy = Ty.getElementType();
119     const int Size = Ty.getSizeInBits();
120     const int EltSize = EltTy.getSizeInBits();
121     const int NextMul32 = (Size + 31) / 32;
122 
123     assert(EltSize < 32);
124 
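    // NextMul32 is the number of 32-bit words needed to hold the vector; pick
    // enough elements to cover those words, rounding up.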
125     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
126     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
127   };
128 }
129 
130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
131   return [=](const LegalityQuery &Query) {
132     const LLT QueryTy = Query.Types[TypeIdx];
133     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
134   };
135 }
136 
137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
138   return [=](const LegalityQuery &Query) {
139     const LLT QueryTy = Query.Types[TypeIdx];
140     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
141   };
142 }
143 
144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT QueryTy = Query.Types[TypeIdx];
147     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
148   };
149 }
150 
// Any vector with 32, 64, 128 or 256-bit elements, any multiple of v2s16, and
// any scalar that is a multiple of 32 bits up to 1024 bits.
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     if (Ty.isVector()) {
157       const int EltSize = Ty.getElementType().getSizeInBits();
158       return EltSize == 32 || EltSize == 64 ||
159             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
160              EltSize == 128 || EltSize == 256;
161     }
162 
163     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
164   };
165 }
166 
167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getElementType() == Type;
171   };
172 }
173 
174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     if (!QueryTy.isVector())
178       return false;
179     const LLT EltTy = QueryTy.getElementType();
180     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
181   };
182 }
183 
184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     const LLT Ty = Query.Types[TypeIdx];
187     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
188            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
189   };
190 }
191 
192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
193   return [=](const LegalityQuery &Query) {
194     return Query.Types[TypeIdx0].getSizeInBits() <
195            Query.Types[TypeIdx1].getSizeInBits();
196   };
197 }
198 
199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
200   return [=](const LegalityQuery &Query) {
201     return Query.Types[TypeIdx0].getSizeInBits() >
202            Query.Types[TypeIdx1].getSizeInBits();
203   };
204 }
205 
206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
207                                          const GCNTargetMachine &TM)
208   :  ST(ST_) {
209   using namespace TargetOpcode;
210 
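  // Build an LLT pointer type for the given address space, using the pointer
  // width the target machine reports for it.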
211   auto GetAddrSpacePtr = [&TM](unsigned AS) {
212     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
213   };
214 
215   const LLT S1 = LLT::scalar(1);
216   const LLT S16 = LLT::scalar(16);
217   const LLT S32 = LLT::scalar(32);
218   const LLT S64 = LLT::scalar(64);
219   const LLT S128 = LLT::scalar(128);
220   const LLT S256 = LLT::scalar(256);
221   const LLT S512 = LLT::scalar(512);
222   const LLT S1024 = LLT::scalar(1024);
223 
224   const LLT V2S16 = LLT::vector(2, 16);
225   const LLT V4S16 = LLT::vector(4, 16);
226 
227   const LLT V2S32 = LLT::vector(2, 32);
228   const LLT V3S32 = LLT::vector(3, 32);
229   const LLT V4S32 = LLT::vector(4, 32);
230   const LLT V5S32 = LLT::vector(5, 32);
231   const LLT V6S32 = LLT::vector(6, 32);
232   const LLT V7S32 = LLT::vector(7, 32);
233   const LLT V8S32 = LLT::vector(8, 32);
234   const LLT V9S32 = LLT::vector(9, 32);
235   const LLT V10S32 = LLT::vector(10, 32);
236   const LLT V11S32 = LLT::vector(11, 32);
237   const LLT V12S32 = LLT::vector(12, 32);
238   const LLT V13S32 = LLT::vector(13, 32);
239   const LLT V14S32 = LLT::vector(14, 32);
240   const LLT V15S32 = LLT::vector(15, 32);
241   const LLT V16S32 = LLT::vector(16, 32);
242   const LLT V32S32 = LLT::vector(32, 32);
243 
244   const LLT V2S64 = LLT::vector(2, 64);
245   const LLT V3S64 = LLT::vector(3, 64);
246   const LLT V4S64 = LLT::vector(4, 64);
247   const LLT V5S64 = LLT::vector(5, 64);
248   const LLT V6S64 = LLT::vector(6, 64);
249   const LLT V7S64 = LLT::vector(7, 64);
250   const LLT V8S64 = LLT::vector(8, 64);
251   const LLT V16S64 = LLT::vector(16, 64);
252 
253   std::initializer_list<LLT> AllS32Vectors =
254     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
255      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
256   std::initializer_list<LLT> AllS64Vectors =
257     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
258 
259   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
260   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
261   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
262   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
263   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
264   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
265   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
266 
267   const LLT CodePtr = FlatPtr;
268 
269   const std::initializer_list<LLT> AddrSpaces64 = {
270     GlobalPtr, ConstantPtr, FlatPtr
271   };
272 
273   const std::initializer_list<LLT> AddrSpaces32 = {
274     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
275   };
276 
277   const std::initializer_list<LLT> FPTypesBase = {
278     S32, S64
279   };
280 
281   const std::initializer_list<LLT> FPTypes16 = {
282     S32, S64, S16
283   };
284 
285   const std::initializer_list<LLT> FPTypesPK16 = {
286     S32, S64, S16, V2S16
287   };
288 
289   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
290 
291   setAction({G_BRCOND, S1}, Legal); // VCC branches
292   setAction({G_BRCOND, S32}, Legal); // SCC branches
293 
294   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
295   // elements for v3s16
296   getActionDefinitionsBuilder(G_PHI)
297     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
298     .legalFor(AllS32Vectors)
299     .legalFor(AllS64Vectors)
300     .legalFor(AddrSpaces64)
301     .legalFor(AddrSpaces32)
302     .clampScalar(0, S32, S256)
303     .widenScalarToNextPow2(0, 32)
304     .clampMaxNumElements(0, S32, 16)
305     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
306     .legalIf(isPointer(0));
307 
308   if (ST.hasVOP3PInsts()) {
309     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
310       .legalFor({S32, S16, V2S16})
311       .clampScalar(0, S16, S32)
312       .clampMaxNumElements(0, S16, 2)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else if (ST.has16BitInsts()) {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32, S16})
318       .clampScalar(0, S16, S32)
319       .scalarize(0)
320       .widenScalarToNextPow2(0, 32);
321   } else {
322     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
323       .legalFor({S32})
324       .clampScalar(0, S32, S32)
325       .scalarize(0);
326   }
327 
328   // FIXME: Not really legal. Placeholder for custom lowering.
329   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
330     .customFor({S32, S64})
331     .clampScalar(0, S32, S64)
332     .widenScalarToNextPow2(0, 32)
333     .scalarize(0);
334 
335   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
336     .legalFor({S32})
337     .clampScalar(0, S32, S32)
338     .scalarize(0);
339 
340   // Report legal for any types we can handle anywhere. For the cases only legal
341   // on the SALU, RegBankSelect will be able to re-legalize.
342   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
343     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
344     .clampScalar(0, S32, S64)
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
347     .widenScalarToNextPow2(0)
348     .scalarize(0);
349 
350   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
351                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
352     .legalFor({{S32, S1}, {S32, S32}})
353     .minScalar(0, S32)
354     // TODO: .scalarize(0)
355     .lower();
356 
357   getActionDefinitionsBuilder(G_BITCAST)
358     // Don't worry about the size constraint.
359     .legalIf(all(isRegisterType(0), isRegisterType(1)))
360     .lower();
361 
362 
363   getActionDefinitionsBuilder(G_CONSTANT)
364     .legalFor({S1, S32, S64, S16, GlobalPtr,
365                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
366     .clampScalar(0, S32, S64)
367     .widenScalarToNextPow2(0)
368     .legalIf(isPointer(0));
369 
370   getActionDefinitionsBuilder(G_FCONSTANT)
371     .legalFor({S32, S64, S16})
372     .clampScalar(0, S16, S64);
373 
374   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
375       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
376                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
377       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378       .clampScalarOrElt(0, S32, S1024)
379       .legalIf(isMultiple32(0))
380       .widenScalarToNextPow2(0, 32)
381       .clampMaxNumElements(0, S32, 16);
382 
383   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
384   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
385     .unsupportedFor({PrivatePtr})
386     .custom();
387   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
388 
389   auto &FPOpActions = getActionDefinitionsBuilder(
390     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
391     .legalFor({S32, S64});
392   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
393     .customFor({S32, S64});
394   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
395     .customFor({S32, S64});
396 
397   if (ST.has16BitInsts()) {
398     if (ST.hasVOP3PInsts())
399       FPOpActions.legalFor({S16, V2S16});
400     else
401       FPOpActions.legalFor({S16});
402 
403     TrigActions.customFor({S16});
404     FDIVActions.customFor({S16});
405   }
406 
407   auto &MinNumMaxNum = getActionDefinitionsBuilder({
408       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
409 
410   if (ST.hasVOP3PInsts()) {
411     MinNumMaxNum.customFor(FPTypesPK16)
412       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
413       .clampMaxNumElements(0, S16, 2)
414       .clampScalar(0, S16, S64)
415       .scalarize(0);
416   } else if (ST.has16BitInsts()) {
417     MinNumMaxNum.customFor(FPTypes16)
418       .clampScalar(0, S16, S64)
419       .scalarize(0);
420   } else {
421     MinNumMaxNum.customFor(FPTypesBase)
422       .clampScalar(0, S32, S64)
423       .scalarize(0);
424   }
425 
426   if (ST.hasVOP3PInsts())
427     FPOpActions.clampMaxNumElements(0, S16, 2);
428 
429   FPOpActions
430     .scalarize(0)
431     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
432 
433   TrigActions
434     .scalarize(0)
435     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
436 
437   FDIVActions
438     .scalarize(0)
439     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
440 
441   getActionDefinitionsBuilder({G_FNEG, G_FABS})
442     .legalFor(FPTypesPK16)
443     .clampMaxNumElements(0, S16, 2)
444     .scalarize(0)
445     .clampScalar(0, S16, S64);
446 
447   if (ST.has16BitInsts()) {
448     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
449       .legalFor({S32, S64, S16})
450       .scalarize(0)
451       .clampScalar(0, S16, S64);
452   } else {
453     getActionDefinitionsBuilder(G_FSQRT)
454       .legalFor({S32, S64})
455       .scalarize(0)
456       .clampScalar(0, S32, S64);
457 
458     if (ST.hasFractBug()) {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .customFor({S64})
461         .legalFor({S32, S64})
462         .scalarize(0)
463         .clampScalar(0, S32, S64);
464     } else {
465       getActionDefinitionsBuilder(G_FFLOOR)
466         .legalFor({S32, S64})
467         .scalarize(0)
468         .clampScalar(0, S32, S64);
469     }
470   }
471 
472   getActionDefinitionsBuilder(G_FPTRUNC)
473     .legalFor({{S32, S64}, {S16, S32}})
474     .scalarize(0)
475     .lower();
476 
477   getActionDefinitionsBuilder(G_FPEXT)
478     .legalFor({{S64, S32}, {S32, S16}})
479     .lowerFor({{S64, S16}}) // FIXME: Implement
480     .scalarize(0);
481 
482   getActionDefinitionsBuilder(G_FSUB)
483       // Use actual fsub instruction
484       .legalFor({S32})
485       // Must use fadd + fneg
486       .lowerFor({S64, S16, V2S16})
487       .scalarize(0)
488       .clampScalar(0, S32, S64);
489 
490   // Whether this is legal depends on the floating point mode for the function.
491   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
492   if (ST.hasMadF16())
493     FMad.customFor({S32, S16});
494   else
495     FMad.customFor({S32});
496   FMad.scalarize(0)
497       .lower();
498 
499   // TODO: Do we need to clamp maximum bitwidth?
500   getActionDefinitionsBuilder(G_TRUNC)
501     .legalIf(isScalar(0))
502     .legalFor({{V2S16, V2S32}})
503     .clampMaxNumElements(0, S16, 2)
504     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
505     // situations (like an invalid implicit use), we don't want to infinite loop
506     // in the legalizer.
507     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
508     .alwaysLegal();
509 
510   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
511     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
512                {S32, S1}, {S64, S1}, {S16, S1}})
513     .scalarize(0)
514     .clampScalar(0, S32, S64)
515     .widenScalarToNextPow2(1, 32);
516 
517   // TODO: Split s1->s64 during regbankselect for VALU.
518   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
519     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
520     .lowerFor({{S32, S64}})
521     .lowerIf(typeIs(1, S1))
522     .customFor({{S64, S64}});
523   if (ST.has16BitInsts())
524     IToFP.legalFor({{S16, S16}});
525   IToFP.clampScalar(1, S32, S64)
526        .scalarize(0)
527        .widenScalarToNextPow2(1);
528 
529   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
530     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
531     .customFor({{S64, S64}});
532   if (ST.has16BitInsts())
533     FPToI.legalFor({{S16, S16}});
534   else
535     FPToI.minScalar(1, S32);
536 
537   FPToI.minScalar(0, S32)
538        .scalarize(0)
539        .lower();
540 
541   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
542     .scalarize(0)
543     .lower();
544 
545   if (ST.has16BitInsts()) {
546     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
547       .legalFor({S16, S32, S64})
548       .clampScalar(0, S16, S64)
549       .scalarize(0);
550   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
551     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
552       .legalFor({S32, S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   } else {
556     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
557       .legalFor({S32})
558       .customFor({S64})
559       .clampScalar(0, S32, S64)
560       .scalarize(0);
561   }
562 
563   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
564     .scalarize(0)
565     .alwaysLegal();
566 
567   auto &CmpBuilder =
568     getActionDefinitionsBuilder(G_ICMP)
569     // The compare output type differs based on the register bank of the output,
570     // so make both s1 and s32 legal.
571     //
572     // Scalar compares producing output in scc will be promoted to s32, as that
573     // is the allocatable register type that will be needed for the copy from
574     // scc. This will be promoted during RegBankSelect, and we assume something
575     // before that won't try to use s32 result types.
576     //
577     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
578     // bank.
579     .legalForCartesianProduct(
580       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
581     .legalForCartesianProduct(
582       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
583   if (ST.has16BitInsts()) {
584     CmpBuilder.legalFor({{S1, S16}});
585   }
586 
587   CmpBuilder
588     .widenScalarToNextPow2(1)
589     .clampScalar(1, S32, S64)
590     .scalarize(0)
591     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
592 
593   getActionDefinitionsBuilder(G_FCMP)
594     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
595     .widenScalarToNextPow2(1)
596     .clampScalar(1, S32, S64)
597     .scalarize(0);
598 
599   // FIXME: fpow has a selection pattern that should move to custom lowering.
600   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
601   if (ST.has16BitInsts())
602     Exp2Ops.legalFor({S32, S16});
603   else
604     Exp2Ops.legalFor({S32});
605   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
606   Exp2Ops.scalarize(0);
607 
608   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
609   if (ST.has16BitInsts())
610     ExpOps.customFor({{S32}, {S16}});
611   else
612     ExpOps.customFor({S32});
613   ExpOps.clampScalar(0, MinScalarFPTy, S32)
614         .scalarize(0);
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder(G_CTPOP)
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   // The hardware instructions return a different result on 0 than the generic
626   // instructions expect. The hardware produces -1, but these produce the
627   // bitwidth.
628   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
629     .scalarize(0)
630     .clampScalar(0, S32, S32)
631     .clampScalar(1, S32, S64)
632     .widenScalarToNextPow2(0, 32)
633     .widenScalarToNextPow2(1, 32)
634     .lower();
635 
636   // The 64-bit versions produce 32-bit results, but only on the SALU.
637   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
638     .legalFor({{S32, S32}, {S32, S64}})
639     .clampScalar(0, S32, S32)
640     .clampScalar(1, S32, S64)
641     .scalarize(0)
642     .widenScalarToNextPow2(0, 32)
643     .widenScalarToNextPow2(1, 32);
644 
645   getActionDefinitionsBuilder(G_BITREVERSE)
646     .legalFor({S32})
647     .clampScalar(0, S32, S32)
648     .scalarize(0);
649 
650   if (ST.has16BitInsts()) {
651     getActionDefinitionsBuilder(G_BSWAP)
652       .legalFor({S16, S32, V2S16})
653       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
656       .widenScalarToNextPow2(0)
657       .clampScalar(0, S16, S32)
658       .scalarize(0);
659 
660     if (ST.hasVOP3PInsts()) {
661       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
662         .legalFor({S32, S16, V2S16})
663         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
664         .clampMaxNumElements(0, S16, 2)
665         .minScalar(0, S16)
666         .widenScalarToNextPow2(0)
667         .scalarize(0)
668         .lower();
669     } else {
670       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
671         .legalFor({S32, S16})
672         .widenScalarToNextPow2(0)
673         .minScalar(0, S16)
674         .scalarize(0)
675         .lower();
676     }
677   } else {
678     // TODO: Should have same legality without v_perm_b32
679     getActionDefinitionsBuilder(G_BSWAP)
680       .legalFor({S32})
681       .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
684       .widenScalarToNextPow2(0)
685       .maxScalar(0, S32)
686       .scalarize(0)
687       .lower();
688 
689     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
690       .legalFor({S32})
691       .minScalar(0, S32)
692       .widenScalarToNextPow2(0)
693       .scalarize(0)
694       .lower();
695   }
696 
697   getActionDefinitionsBuilder(G_INTTOPTR)
698     // List the common cases
699     .legalForCartesianProduct(AddrSpaces64, {S64})
700     .legalForCartesianProduct(AddrSpaces32, {S32})
701     .scalarize(0)
702     // Accept any address space as long as the size matches
703     .legalIf(sameSize(0, 1))
704     .widenScalarIf(smallerThan(1, 0),
705       [](const LegalityQuery &Query) {
706         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
707       })
708     .narrowScalarIf(greaterThan(1, 0),
709       [](const LegalityQuery &Query) {
710         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
711       });
712 
713   getActionDefinitionsBuilder(G_PTRTOINT)
714     // List the common cases
715     .legalForCartesianProduct(AddrSpaces64, {S64})
716     .legalForCartesianProduct(AddrSpaces32, {S32})
717     .scalarize(0)
718     // Accept any address space as long as the size matches
719     .legalIf(sameSize(0, 1))
720     .widenScalarIf(smallerThan(0, 1),
721       [](const LegalityQuery &Query) {
722         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
723       })
724     .narrowScalarIf(
725       greaterThan(0, 1),
726       [](const LegalityQuery &Query) {
727         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
728       });
729 
730   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
731     .scalarize(0)
732     .custom();
733 
734   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
735   // handle some operations by just promoting the register during
736   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
737   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
738     switch (AS) {
739     // FIXME: Private element size.
740     case AMDGPUAS::PRIVATE_ADDRESS:
741       return 32;
742     // FIXME: Check subtarget
743     case AMDGPUAS::LOCAL_ADDRESS:
744       return ST.useDS128() ? 128 : 64;
745 
746     // Treat constant and global as identical. SMRD loads are sometimes usable
747     // for global loads (ideally constant address space should be eliminated)
748     // depending on the context. Legality cannot be context dependent, but
749     // RegBankSelect can split the load as necessary depending on the pointer
750     // register bank/uniformity and if the memory is invariant or not written in
751     // a kernel.
752     case AMDGPUAS::CONSTANT_ADDRESS:
753     case AMDGPUAS::GLOBAL_ADDRESS:
754       return IsLoad ? 512 : 128;
755     default:
756       return 128;
757     }
758   };
759 
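  // Return true if a load/store of this type must be split: vector extloads,
  // accesses wider than the address space allows, sizes that map to an
  // unsupported register count (3 dwords without dwordx3 support, or a
  // non-power-of-2 count), and under-aligned accesses the target cannot
  // perform directly.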
760   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
761                                     bool IsLoad) -> bool {
762     const LLT DstTy = Query.Types[0];
763 
764     // Split vector extloads.
765     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
766     unsigned Align = Query.MMODescrs[0].AlignInBits;
767 
768     if (MemSize < DstTy.getSizeInBits())
769       MemSize = std::max(MemSize, Align);
770 
771     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
772       return true;
773 
774     const LLT PtrTy = Query.Types[1];
775     unsigned AS = PtrTy.getAddressSpace();
776     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
777       return true;
778 
779     // Catch weird sized loads that don't evenly divide into the access sizes
780     // TODO: May be able to widen depending on alignment etc.
781     unsigned NumRegs = (MemSize + 31) / 32;
782     if (NumRegs == 3) {
783       if (!ST.hasDwordx3LoadStores())
784         return true;
785     } else {
786       // If the alignment allows, these should have been widened.
787       if (!isPowerOf2_32(NumRegs))
788         return true;
789     }
790 
791     if (Align < MemSize) {
792       const SITargetLowering *TLI = ST.getTargetLowering();
793       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
794     }
795 
796     return false;
797   };
798 
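  // Return true if a load with a non-power-of-2 result type should be widened
  // to the next power of 2: only when the result stays below the
  // address-space limit and the access is aligned to the rounded size.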
799   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
800     unsigned Size = Query.Types[0].getSizeInBits();
801     if (isPowerOf2_32(Size))
802       return false;
803 
804     if (Size == 96 && ST.hasDwordx3LoadStores())
805       return false;
806 
807     unsigned AddrSpace = Query.Types[1].getAddressSpace();
808     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
809       return false;
810 
811     unsigned Align = Query.MMODescrs[0].AlignInBits;
812     unsigned RoundedSize = NextPowerOf2(Size);
813     return (Align >= RoundedSize);
814   };
815 
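  // Required alignment (in bits) for global/flat accesses; 0 means no
  // alignment restriction when the subtarget supports unaligned buffer access.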
816   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
817   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
818   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
819 
820   // TODO: Refine based on subtargets which support unaligned access or 128-bit
821   // LDS
822   // TODO: Unsupported flat for SI.
823 
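  // G_LOAD and G_STORE share most of their rules; the narrowing for wide
  // truncating stores is appended below just for G_STORE.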
824   for (unsigned Op : {G_LOAD, G_STORE}) {
825     const bool IsStore = Op == G_STORE;
826 
827     auto &Actions = getActionDefinitionsBuilder(Op);
828     // Whitelist the common cases.
829     // TODO: Loads to s16 on gfx9
830     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
831                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
832                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
833                                       {S128, GlobalPtr, 128, GlobalAlign32},
834                                       {S64, GlobalPtr, 64, GlobalAlign32},
835                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
836                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
837                                       {S32, GlobalPtr, 8, GlobalAlign8},
838                                       {S32, GlobalPtr, 16, GlobalAlign16},
839 
840                                       {S32, LocalPtr, 32, 32},
841                                       {S64, LocalPtr, 64, 32},
842                                       {V2S32, LocalPtr, 64, 32},
843                                       {S32, LocalPtr, 8, 8},
844                                       {S32, LocalPtr, 16, 16},
845                                       {V2S16, LocalPtr, 32, 32},
846 
847                                       {S32, PrivatePtr, 32, 32},
848                                       {S32, PrivatePtr, 8, 8},
849                                       {S32, PrivatePtr, 16, 16},
850                                       {V2S16, PrivatePtr, 32, 32},
851 
852                                       {S32, FlatPtr, 32, GlobalAlign32},
853                                       {S32, FlatPtr, 16, GlobalAlign16},
854                                       {S32, FlatPtr, 8, GlobalAlign8},
855                                       {V2S16, FlatPtr, 32, GlobalAlign32},
856 
857                                       {S32, ConstantPtr, 32, GlobalAlign32},
858                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
859                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
860                                       {S64, ConstantPtr, 64, GlobalAlign32},
861                                       {S128, ConstantPtr, 128, GlobalAlign32},
862                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
863     Actions
864         .customIf(typeIs(1, Constant32Ptr))
865         // Widen suitably aligned loads by loading extra elements.
866         .moreElementsIf([=](const LegalityQuery &Query) {
867             const LLT Ty = Query.Types[0];
868             return Op == G_LOAD && Ty.isVector() &&
869                    shouldWidenLoadResult(Query);
870           }, moreElementsToNextPow2(0))
871         .widenScalarIf([=](const LegalityQuery &Query) {
872             const LLT Ty = Query.Types[0];
873             return Op == G_LOAD && !Ty.isVector() &&
874                    shouldWidenLoadResult(Query);
875           }, widenScalarOrEltToNextPow2(0))
876         .narrowScalarIf(
877             [=](const LegalityQuery &Query) -> bool {
878               return !Query.Types[0].isVector() &&
879                      needToSplitMemOp(Query, Op == G_LOAD);
880             },
881             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
882               const LLT DstTy = Query.Types[0];
883               const LLT PtrTy = Query.Types[1];
884 
885               const unsigned DstSize = DstTy.getSizeInBits();
886               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
887 
888               // Split extloads.
889               if (DstSize > MemSize)
890                 return std::make_pair(0, LLT::scalar(MemSize));
891 
892               if (!isPowerOf2_32(DstSize)) {
893                 // We're probably decomposing an odd sized store. Try to split
894                 // to the widest type. TODO: Account for alignment. As-is it
895                 // should be OK, since the new parts will be further legalized.
896                 unsigned FloorSize = PowerOf2Floor(DstSize);
897                 return std::make_pair(0, LLT::scalar(FloorSize));
898               }
899 
900               if (DstSize > 32 && (DstSize % 32 != 0)) {
901                 // FIXME: Need a way to specify non-extload of larger size if
902                 // suitably aligned.
903                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
904               }
905 
906               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
907                                                      Op == G_LOAD);
908               if (MemSize > MaxSize)
909                 return std::make_pair(0, LLT::scalar(MaxSize));
910 
911               unsigned Align = Query.MMODescrs[0].AlignInBits;
912               return std::make_pair(0, LLT::scalar(Align));
913             })
914         .fewerElementsIf(
915             [=](const LegalityQuery &Query) -> bool {
916               return Query.Types[0].isVector() &&
917                      needToSplitMemOp(Query, Op == G_LOAD);
918             },
919             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
920               const LLT DstTy = Query.Types[0];
921               const LLT PtrTy = Query.Types[1];
922 
923               LLT EltTy = DstTy.getElementType();
924               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
925                                                      Op == G_LOAD);
926 
927               // FIXME: Handle widened to power of 2 results better. This ends
928               // up scalarizing.
929               // FIXME: 3 element stores scalarized on SI
930 
931               // Split if it's too large for the address space.
932               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
933                 unsigned NumElts = DstTy.getNumElements();
934                 unsigned EltSize = EltTy.getSizeInBits();
935 
936                 if (MaxSize % EltSize == 0) {
937                   return std::make_pair(
938                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
939                 }
940 
941                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
942 
943                 // FIXME: Refine when odd breakdowns handled
944                 // The scalars will need to be re-legalized.
945                 if (NumPieces == 1 || NumPieces >= NumElts ||
946                     NumElts % NumPieces != 0)
947                   return std::make_pair(0, EltTy);
948 
949                 return std::make_pair(0,
950                                       LLT::vector(NumElts / NumPieces, EltTy));
951               }
952 
953               // FIXME: We could probably handle weird extending loads better.
954               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
955               if (DstTy.getSizeInBits() > MemSize)
956                 return std::make_pair(0, EltTy);
957 
958               unsigned EltSize = EltTy.getSizeInBits();
959               unsigned DstSize = DstTy.getSizeInBits();
960               if (!isPowerOf2_32(DstSize)) {
961                 // We're probably decomposing an odd sized store. Try to split
962                 // to the widest type. TODO: Account for alignment. As-is it
963                 // should be OK, since the new parts will be further legalized.
964                 unsigned FloorSize = PowerOf2Floor(DstSize);
965                 return std::make_pair(
966                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
967               }
968 
969               // Need to split because of alignment.
970               unsigned Align = Query.MMODescrs[0].AlignInBits;
971               if (EltSize > Align &&
972                   (EltSize / Align < DstTy.getNumElements())) {
973                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
974               }
975 
976               // May need relegalization for the scalars.
977               return std::make_pair(0, EltTy);
978             })
979         .minScalar(0, S32);
980 
981     if (IsStore)
982       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
983 
984     // TODO: Need a bitcast lower option?
985     Actions
986         .legalIf([=](const LegalityQuery &Query) {
987           const LLT Ty0 = Query.Types[0];
988           unsigned Size = Ty0.getSizeInBits();
989           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
990           unsigned Align = Query.MMODescrs[0].AlignInBits;
991 
992           // FIXME: Widening store from alignment not valid.
993           if (MemSize < Size)
994             MemSize = std::max(MemSize, Align);
995 
996           // No extending vector loads.
997           if (Size > MemSize && Ty0.isVector())
998             return false;
999 
1000           switch (MemSize) {
1001           case 8:
1002           case 16:
1003             return Size == 32;
1004           case 32:
1005           case 64:
1006           case 128:
1007             return true;
1008           case 96:
1009             return ST.hasDwordx3LoadStores();
1010           case 256:
1011           case 512:
1012             return true;
1013           default:
1014             return false;
1015           }
1016         })
1017         .widenScalarToNextPow2(0)
1018         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1019   }
1020 
1021   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1022                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1023                                                   {S32, GlobalPtr, 16, 2 * 8},
1024                                                   {S32, LocalPtr, 8, 8},
1025                                                   {S32, LocalPtr, 16, 16},
1026                                                   {S32, PrivatePtr, 8, 8},
1027                                                   {S32, PrivatePtr, 16, 16},
1028                                                   {S32, ConstantPtr, 8, 8},
1029                                                   {S32, ConstantPtr, 16, 2 * 8}});
1030   if (ST.hasFlatAddressSpace()) {
1031     ExtLoads.legalForTypesWithMemDesc(
1032         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1033   }
1034 
1035   ExtLoads.clampScalar(0, S32, S32)
1036           .widenScalarToNextPow2(0)
1037           .unsupportedIfMemSizeNotPow2()
1038           .lower();
1039 
1040   auto &Atomics = getActionDefinitionsBuilder(
1041     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1042      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1043      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1044      G_ATOMICRMW_UMIN})
1045     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1046                {S64, GlobalPtr}, {S64, LocalPtr}});
1047   if (ST.hasFlatAddressSpace()) {
1048     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1049   }
1050 
1051   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1052     .legalFor({{S32, LocalPtr}});
1053 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1056   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1057     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1058                 {S32, FlatPtr}, {S64, FlatPtr}})
1059     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1060                {S32, RegionPtr}, {S64, RegionPtr}});
1061   // TODO: Pointer types, any 32-bit or 64-bit vector
1062 
1063   // Condition should be s32 for scalar, s1 for vector.
1064   getActionDefinitionsBuilder(G_SELECT)
1065     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1066           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1067           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1068     .clampScalar(0, S16, S64)
1069     .scalarize(1)
1070     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1071     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1072     .clampMaxNumElements(0, S32, 2)
1073     .clampMaxNumElements(0, LocalPtr, 2)
1074     .clampMaxNumElements(0, PrivatePtr, 2)
1075     .scalarize(0)
1076     .widenScalarToNextPow2(0)
1077     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1078 
1079   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1080   // be more flexible with the shift amount type.
1081   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1082     .legalFor({{S32, S32}, {S64, S32}});
1083   if (ST.has16BitInsts()) {
1084     if (ST.hasVOP3PInsts()) {
1085       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1086             .clampMaxNumElements(0, S16, 2);
1087     } else
1088       Shifts.legalFor({{S16, S16}});
1089 
1090     // TODO: Support 16-bit shift amounts for all types
1091     Shifts.widenScalarIf(
1092       [=](const LegalityQuery &Query) {
1093         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1094         // 32-bit amount.
1095         const LLT ValTy = Query.Types[0];
1096         const LLT AmountTy = Query.Types[1];
1097         return ValTy.getSizeInBits() <= 16 &&
1098                AmountTy.getSizeInBits() < 16;
1099       }, changeTo(1, S16));
1100     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1101     Shifts.clampScalar(1, S32, S32);
1102     Shifts.clampScalar(0, S16, S64);
1103     Shifts.widenScalarToNextPow2(0, 16);
1104   } else {
1105     // Make sure we legalize the shift amount type first, as the general
1106     // expansion for the shifted type will produce much worse code if it hasn't
1107     // been truncated already.
1108     Shifts.clampScalar(1, S32, S32);
1109     Shifts.clampScalar(0, S32, S64);
1110     Shifts.widenScalarToNextPow2(0, 32);
1111   }
1112   Shifts.scalarize(0);
1113 
1114   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1115     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1116     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1117     unsigned IdxTypeIdx = 2;
1118 
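    // Use the custom legalization for 16-bit or 32-bit-multiple elements, a
    // vector size that is a multiple of 32 bits (up to 1024), and a 32-bit
    // index.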
1119     getActionDefinitionsBuilder(Op)
1120       .customIf([=](const LegalityQuery &Query) {
1121           const LLT EltTy = Query.Types[EltTypeIdx];
1122           const LLT VecTy = Query.Types[VecTypeIdx];
1123           const LLT IdxTy = Query.Types[IdxTypeIdx];
1124           return (EltTy.getSizeInBits() == 16 ||
1125                   EltTy.getSizeInBits() % 32 == 0) &&
1126                  VecTy.getSizeInBits() % 32 == 0 &&
1127                  VecTy.getSizeInBits() <= 1024 &&
1128                  IdxTy.getSizeInBits() == 32;
1129         })
1130       .clampScalar(EltTypeIdx, S32, S64)
1131       .clampScalar(VecTypeIdx, S32, S64)
1132       .clampScalar(IdxTypeIdx, S32, S32);
1133   }
1134 
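  // The result type of G_EXTRACT_VECTOR_ELT must match the vector's element
  // type.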
1135   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1136     .unsupportedIf([=](const LegalityQuery &Query) {
1137         const LLT &EltTy = Query.Types[1].getElementType();
1138         return Query.Types[0] != EltTy;
1139       });
1140 
1141   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1142     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1143     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1144 
1145     // FIXME: Doesn't handle extract of illegal sizes.
1146     getActionDefinitionsBuilder(Op)
1147       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1148       // FIXME: Multiples of 16 should not be legal.
1149       .legalIf([=](const LegalityQuery &Query) {
1150           const LLT BigTy = Query.Types[BigTyIdx];
1151           const LLT LitTy = Query.Types[LitTyIdx];
1152           return (BigTy.getSizeInBits() % 32 == 0) &&
1153                  (LitTy.getSizeInBits() % 16 == 0);
1154         })
1155       .widenScalarIf(
1156         [=](const LegalityQuery &Query) {
1157           const LLT BigTy = Query.Types[BigTyIdx];
1158           return (BigTy.getScalarSizeInBits() < 16);
1159         },
1160         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1161       .widenScalarIf(
1162         [=](const LegalityQuery &Query) {
1163           const LLT LitTy = Query.Types[LitTyIdx];
1164           return (LitTy.getScalarSizeInBits() < 16);
1165         },
1166         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1167       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1168       .widenScalarToNextPow2(BigTyIdx, 32);
1169 
1170   }
1171 
1172   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1173     .legalForCartesianProduct(AllS32Vectors, {S32})
1174     .legalForCartesianProduct(AllS64Vectors, {S64})
1175     .clampNumElements(0, V16S32, V32S32)
1176     .clampNumElements(0, V2S64, V16S64)
1177     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1178 
1179   if (ST.hasScalarPackInsts()) {
1180     BuildVector
1181       // FIXME: Should probably widen s1 vectors straight to s32
1182       .minScalarOrElt(0, S16)
1183       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1184       .minScalar(1, S32);
1185 
1186     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1187       .legalFor({V2S16, S32})
1188       .lower();
1189     BuildVector.minScalarOrElt(0, S32);
1190   } else {
1191     BuildVector.customFor({V2S16, S16});
1192     BuildVector.minScalarOrElt(0, S32);
1193 
1194     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1195       .customFor({V2S16, S32})
1196       .lower();
1197   }
1198 
1199   BuildVector.legalIf(isRegisterType(0));
1200 
1201   // FIXME: Clamp maximum size
1202   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1203     .legalIf(isRegisterType(0));
1204 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine them out
  // pre-legalization.
1207   if (ST.hasVOP3PInsts()) {
1208     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1209       .customFor({V2S16, V2S16})
1210       .lower();
1211   } else
1212     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1213 
1214   // Merge/Unmerge
1215   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1216     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1217     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1218 
1219     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1220       const LLT Ty = Query.Types[TypeIdx];
1221       if (Ty.isVector()) {
1222         const LLT &EltTy = Ty.getElementType();
1223         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1224           return true;
1225         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1226           return true;
1227       }
1228       return false;
1229     };
1230 
1231     auto &Builder = getActionDefinitionsBuilder(Op)
1232       .lowerFor({{S16, V2S16}})
1233       .lowerIf([=](const LegalityQuery &Query) {
1234           const LLT BigTy = Query.Types[BigTyIdx];
1235           return BigTy.getSizeInBits() == 32;
1236         })
1237       // Try to widen to s16 first for small types.
1238       // TODO: Only do this on targets with legal s16 shifts
1239       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1240       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1241       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1242       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1243                            elementTypeIs(1, S16)),
1244                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1246       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1247       // valid.
1248       .clampScalar(LitTyIdx, S32, S512)
1249       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1250       // Break up vectors with weird elements into scalars
1251       .fewerElementsIf(
1252         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1253         scalarize(0))
1254       .fewerElementsIf(
1255         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1256         scalarize(1))
1257       .clampScalar(BigTyIdx, S32, S1024);
1258 
1259     if (Op == G_MERGE_VALUES) {
1260       Builder.widenScalarIf(
1261         // TODO: Use 16-bit shifts if legal for 8-bit values?
1262         [=](const LegalityQuery &Query) {
1263           const LLT Ty = Query.Types[LitTyIdx];
1264           return Ty.getSizeInBits() < 32;
1265         },
1266         changeTo(LitTyIdx, S32));
1267     }
1268 
1269     Builder.widenScalarIf(
1270       [=](const LegalityQuery &Query) {
1271         const LLT Ty = Query.Types[BigTyIdx];
1272         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1273           Ty.getSizeInBits() % 16 != 0;
1274       },
1275       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever is
        // smaller.
1278         const LLT &Ty = Query.Types[BigTyIdx];
1279         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1280         if (NewSizeInBits >= 256) {
1281           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1282           if (RoundedTo < NewSizeInBits)
1283             NewSizeInBits = RoundedTo;
1284         }
1285         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1286       })
1287       .legalIf([=](const LegalityQuery &Query) {
1288           const LLT &BigTy = Query.Types[BigTyIdx];
1289           const LLT &LitTy = Query.Types[LitTyIdx];
1290 
1291           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1292             return false;
1293           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1294             return false;
1295 
1296           return BigTy.getSizeInBits() % 16 == 0 &&
1297                  LitTy.getSizeInBits() % 16 == 0 &&
1298                  BigTy.getSizeInBits() <= 1024;
1299         })
1300       // Any vectors left are the wrong size. Scalarize them.
1301       .scalarize(0)
1302       .scalarize(1);
1303   }
1304 
1305   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1306   // RegBankSelect.
1307   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1308     .legalFor({{S32}, {S64}});
1309 
1310   if (ST.hasVOP3PInsts()) {
1311     SextInReg.lowerFor({{V2S16}})
1312       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1313       // get more vector shift opportunities, since we'll get those when
1314       // expanded.
1315       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1316   } else if (ST.has16BitInsts()) {
1317     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1318   } else {
1319     // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1321     SextInReg.lowerFor({{S32}, {S64}});
1322   }
1323 
1324   SextInReg
1325     .scalarize(0)
1326     .clampScalar(0, S32, S64)
1327     .lower();
1328 
1329   getActionDefinitionsBuilder(G_FSHR)
1330     .legalFor({{S32, S32}})
1331     .scalarize(0)
1332     .lower();
1333 
1334   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1335     .legalFor({S64});
1336 
1337   getActionDefinitionsBuilder({
1338       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1339       G_FCOPYSIGN,
1340 
1341       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1342       G_READ_REGISTER,
1343       G_WRITE_REGISTER,
1344 
1345       G_SADDO, G_SSUBO,
1346 
1347        // TODO: Implement
1348       G_FMINIMUM, G_FMAXIMUM,
1349       G_FSHL
1350     }).lower();
1351 
1352   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1353         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1354         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1355     .unsupported();
1356 
1357   computeTables();
1358   verify(*ST.getInstrInfo());
1359 }
1360 
1361 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1362                                          MachineRegisterInfo &MRI,
1363                                          MachineIRBuilder &B,
1364                                          GISelChangeObserver &Observer) const {
1365   switch (MI.getOpcode()) {
1366   case TargetOpcode::G_ADDRSPACE_CAST:
1367     return legalizeAddrSpaceCast(MI, MRI, B);
1368   case TargetOpcode::G_FRINT:
1369     return legalizeFrint(MI, MRI, B);
1370   case TargetOpcode::G_FCEIL:
1371     return legalizeFceil(MI, MRI, B);
1372   case TargetOpcode::G_INTRINSIC_TRUNC:
1373     return legalizeIntrinsicTrunc(MI, MRI, B);
1374   case TargetOpcode::G_SITOFP:
1375     return legalizeITOFP(MI, MRI, B, true);
1376   case TargetOpcode::G_UITOFP:
1377     return legalizeITOFP(MI, MRI, B, false);
1378   case TargetOpcode::G_FPTOSI:
1379     return legalizeFPTOI(MI, MRI, B, true);
1380   case TargetOpcode::G_FPTOUI:
1381     return legalizeFPTOI(MI, MRI, B, false);
1382   case TargetOpcode::G_FMINNUM:
1383   case TargetOpcode::G_FMAXNUM:
1384   case TargetOpcode::G_FMINNUM_IEEE:
1385   case TargetOpcode::G_FMAXNUM_IEEE:
1386     return legalizeMinNumMaxNum(MI, MRI, B);
1387   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1388     return legalizeExtractVectorElt(MI, MRI, B);
1389   case TargetOpcode::G_INSERT_VECTOR_ELT:
1390     return legalizeInsertVectorElt(MI, MRI, B);
1391   case TargetOpcode::G_SHUFFLE_VECTOR:
1392     return legalizeShuffleVector(MI, MRI, B);
1393   case TargetOpcode::G_FSIN:
1394   case TargetOpcode::G_FCOS:
1395     return legalizeSinCos(MI, MRI, B);
1396   case TargetOpcode::G_GLOBAL_VALUE:
1397     return legalizeGlobalValue(MI, MRI, B);
1398   case TargetOpcode::G_LOAD:
1399     return legalizeLoad(MI, MRI, B, Observer);
1400   case TargetOpcode::G_FMAD:
1401     return legalizeFMad(MI, MRI, B);
1402   case TargetOpcode::G_FDIV:
1403     return legalizeFDIV(MI, MRI, B);
1404   case TargetOpcode::G_UDIV:
1405   case TargetOpcode::G_UREM:
1406     return legalizeUDIV_UREM(MI, MRI, B);
1407   case TargetOpcode::G_SDIV:
1408   case TargetOpcode::G_SREM:
1409     return legalizeSDIV_SREM(MI, MRI, B);
1410   case TargetOpcode::G_ATOMIC_CMPXCHG:
1411     return legalizeAtomicCmpXChg(MI, MRI, B);
1412   case TargetOpcode::G_FLOG:
1413     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1414   case TargetOpcode::G_FLOG10:
1415     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1416   case TargetOpcode::G_FEXP:
1417     return legalizeFExp(MI, B);
1418   case TargetOpcode::G_FPOW:
1419     return legalizeFPow(MI, B);
1420   case TargetOpcode::G_FFLOOR:
1421     return legalizeFFloor(MI, MRI, B);
1422   case TargetOpcode::G_BUILD_VECTOR:
1423     return legalizeBuildVector(MI, MRI, B);
1424   default:
1425     return false;
1426   }
1427 
1428   llvm_unreachable("expected switch to return");
1429 }
1430 
1431 Register AMDGPULegalizerInfo::getSegmentAperture(
1432   unsigned AS,
1433   MachineRegisterInfo &MRI,
1434   MachineIRBuilder &B) const {
1435   MachineFunction &MF = B.getMF();
1436   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1437   const LLT S32 = LLT::scalar(32);
1438 
1439   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1440 
1441   if (ST.hasApertureRegs()) {
1442     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1443     // getreg.
1444     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1445         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1446         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1447     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1448         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1449         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1450     unsigned Encoding =
1451         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1452         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1453         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1454 
1455     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1456 
1457     B.buildInstr(AMDGPU::S_GETREG_B32)
1458       .addDef(GetReg)
1459       .addImm(Encoding);
1460     MRI.setType(GetReg, S32);
1461 
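    // The field read by getreg is WidthM1 + 1 bits wide; shifting it left by
    // that width places the aperture base in the upper bits, producing the
    // high 32 bits of the 64-bit aperture address that callers merge in as
    // the upper half of the flat pointer.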
1462     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1463     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1464   }
1465 
1466   Register QueuePtr = MRI.createGenericVirtualRegister(
1467     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1468 
1469   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1470   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1471     return Register();
1472 
1473   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1474   // private_segment_aperture_base_hi.
1475   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1476 
1477   // TODO: can we be smarter about machine pointer info?
1478   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1479   MachineMemOperand *MMO = MF.getMachineMemOperand(
1480       PtrInfo,
1481       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1482           MachineMemOperand::MOInvariant,
1483       4, commonAlignment(Align(64), StructOffset));
1484 
1485   Register LoadAddr;
1486 
1487   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1488   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1489 }
1490 
1491 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1492   MachineInstr &MI, MachineRegisterInfo &MRI,
1493   MachineIRBuilder &B) const {
1494   MachineFunction &MF = B.getMF();
1495 
1496   B.setInstr(MI);
1497 
1498   const LLT S32 = LLT::scalar(32);
1499   Register Dst = MI.getOperand(0).getReg();
1500   Register Src = MI.getOperand(1).getReg();
1501 
1502   LLT DstTy = MRI.getType(Dst);
1503   LLT SrcTy = MRI.getType(Src);
1504   unsigned DestAS = DstTy.getAddressSpace();
1505   unsigned SrcAS = SrcTy.getAddressSpace();
1506 
1507   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1508   // vector element.
1509   assert(!DstTy.isVector());
1510 
1511   const AMDGPUTargetMachine &TM
1512     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1513 
1514   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1515   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1516     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1517     return true;
1518   }
1519 
1520   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1521     // Truncate.
1522     B.buildExtract(Dst, Src, 0);
1523     MI.eraseFromParent();
1524     return true;
1525   }
1526 
1527   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1528     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1529     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1530 
    // FIXME: This is a bit ugly because it merges two pointers into another
    // pointer type. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1534     auto HighAddr = B.buildConstant(
1535       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1536     B.buildMerge(Dst, {Src, HighAddr});
1537     MI.eraseFromParent();
1538     return true;
1539   }
1540 
1541   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1542     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1543            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1544     unsigned NullVal = TM.getNullPointerValue(DestAS);
1545 
1546     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1547     auto FlatNull = B.buildConstant(SrcTy, 0);
1548 
1549     // Extract low 32-bits of the pointer.
1550     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1551 
1552     auto CmpRes =
1553         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1554     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1555 
1556     MI.eraseFromParent();
1557     return true;
1558   }
1559 
1560   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1561     return false;
1562 
1563   if (!ST.hasFlatAddressSpace())
1564     return false;
1565 
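  // Casting from a 32-bit local/private pointer to a 64-bit flat pointer:
  // combine the segment offset with the aperture base for that segment, and
  // map the segment null value to the flat null value.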
1566   auto SegmentNull =
1567       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1568   auto FlatNull =
1569       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1570 
1571   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1572   if (!ApertureReg.isValid())
1573     return false;
1574 
1575   auto CmpRes =
1576       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1577 
1578   // Coerce the type of the low half of the result so we can use merge_values.
1579   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1580 
1581   // TODO: Should we allow mismatched types but matching sizes in merges to
1582   // avoid the ptrtoint?
1583   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1584   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1585 
1586   MI.eraseFromParent();
1587   return true;
1588 }
1589 
1590 bool AMDGPULegalizerInfo::legalizeFrint(
1591   MachineInstr &MI, MachineRegisterInfo &MRI,
1592   MachineIRBuilder &B) const {
1593   B.setInstr(MI);
1594 
1595   Register Src = MI.getOperand(1).getReg();
1596   LLT Ty = MRI.getType(Src);
1597   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1598 
1599   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1600   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1601 
1602   auto C1 = B.buildFConstant(Ty, C1Val);
1603   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1604 
1605   // TODO: Should this propagate fast-math-flags?
1606   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1607   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1608 
1609   auto C2 = B.buildFConstant(Ty, C2Val);
1610   auto Fabs = B.buildFAbs(Ty, Src);
1611 
1612   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1615 }
1616 
1617 bool AMDGPULegalizerInfo::legalizeFceil(
1618   MachineInstr &MI, MachineRegisterInfo &MRI,
1619   MachineIRBuilder &B) const {
1620   B.setInstr(MI);
1621 
1622   const LLT S1 = LLT::scalar(1);
1623   const LLT S64 = LLT::scalar(64);
1624 
1625   Register Src = MI.getOperand(1).getReg();
1626   assert(MRI.getType(Src) == S64);
1627 
1628   // result = trunc(src)
1629   // if (src > 0.0 && src != result)
1630   //   result += 1.0
1631 
1632   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1633 
1634   const auto Zero = B.buildFConstant(S64, 0.0);
1635   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1639   auto Add = B.buildSelect(S64, And, One, Zero);
1640 
1641   // TODO: Should this propagate fast-math-flags?
1642   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1644 }
1645 
1646 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1647                                               MachineIRBuilder &B) {
1648   const unsigned FractBits = 52;
1649   const unsigned ExpBits = 11;
1650   LLT S32 = LLT::scalar(32);
1651 
1652   auto Const0 = B.buildConstant(S32, FractBits - 32);
1653   auto Const1 = B.buildConstant(S32, ExpBits);
1654 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1658 
1659   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1660 }
1661 
1662 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1663   MachineInstr &MI, MachineRegisterInfo &MRI,
1664   MachineIRBuilder &B) const {
1665   B.setInstr(MI);
1666 
1667   const LLT S1 = LLT::scalar(1);
1668   const LLT S32 = LLT::scalar(32);
1669   const LLT S64 = LLT::scalar(64);
1670 
1671   Register Src = MI.getOperand(1).getReg();
1672   assert(MRI.getType(Src) == S64);
1673 
1674   // TODO: Should this use extract since the low half is unused?
1675   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1676   Register Hi = Unmerge.getReg(1);
1677 
1678   // Extract the upper half, since this is where we will find the sign and
1679   // exponent.
1680   auto Exp = extractF64Exponent(Hi, B);
1681 
1682   const unsigned FractBits = 52;
1683 
1684   // Extract the sign bit.
1685   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1686   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1687 
1688   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1689 
1690   const auto Zero32 = B.buildConstant(S32, 0);
1691 
1692   // Extend back to 64-bits.
1693   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1694 
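  // (FractMask >> Exp) marks the mantissa bits that lie below the binary
  // point for this exponent; clearing them with the inverted mask truncates
  // the value toward zero. An exponent below zero means the magnitude is less
  // than 1, so only the sign survives (+/-0); an exponent above 51 means the
  // value is already an integer, so Src is returned unchanged.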
1695   auto Shr = B.buildAShr(S64, FractMask, Exp);
1696   auto Not = B.buildNot(S64, Shr);
1697   auto Tmp0 = B.buildAnd(S64, Src, Not);
1698   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1699 
1700   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1701   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1702 
1703   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1704   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1706 }
1707 
1708 bool AMDGPULegalizerInfo::legalizeITOFP(
1709   MachineInstr &MI, MachineRegisterInfo &MRI,
1710   MachineIRBuilder &B, bool Signed) const {
1711   B.setInstr(MI);
1712 
1713   Register Dst = MI.getOperand(0).getReg();
1714   Register Src = MI.getOperand(1).getReg();
1715 
1716   const LLT S64 = LLT::scalar(64);
1717   const LLT S32 = LLT::scalar(32);
1718 
1719   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1720 
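  // Convert the two 32-bit halves separately and recombine them as
  // hi * 2^32 + lo: ldexp scales the converted high half by 2^32 and the
  // unsigned low half is then added in exactly.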
1721   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1722 
1723   auto CvtHi = Signed ?
1724     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1725     B.buildUITOFP(S64, Unmerge.getReg(1));
1726 
1727   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1728 
1729   auto ThirtyTwo = B.buildConstant(S32, 32);
1730   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1731     .addUse(CvtHi.getReg(0))
1732     .addUse(ThirtyTwo.getReg(0));
1733 
1734   // TODO: Should this propagate fast-math-flags?
1735   B.buildFAdd(Dst, LdExp, CvtLo);
1736   MI.eraseFromParent();
1737   return true;
1738 }
1739 
1740 // TODO: Copied from DAG implementation. Verify logic and document how this
1741 // actually works.
1742 bool AMDGPULegalizerInfo::legalizeFPTOI(
1743   MachineInstr &MI, MachineRegisterInfo &MRI,
1744   MachineIRBuilder &B, bool Signed) const {
1745   B.setInstr(MI);
1746 
1747   Register Dst = MI.getOperand(0).getReg();
1748   Register Src = MI.getOperand(1).getReg();
1749 
1750   const LLT S64 = LLT::scalar(64);
1751   const LLT S32 = LLT::scalar(32);
1752 
1753   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1754 
1755   unsigned Flags = MI.getFlags();
1756 
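  // Split the result into two 32-bit halves:
  //   Trunc    = trunc(Src)
  //   FloorMul = floor(Trunc * 2^-32)          (K0 = 0x3df0000000000000)
  //   Fma      = Trunc - FloorMul * 2^32       (K1 = 0xc1f0000000000000)
  // FloorMul is the high half, converted with the signed or unsigned
  // conversion as requested; Fma is the non-negative low half, always
  // converted unsigned. The two halves are then merged into the 64-bit result.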
1757   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1758   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1759   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1760 
1761   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1762   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1763   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1764 
1765   auto Hi = Signed ?
1766     B.buildFPTOSI(S32, FloorMul) :
1767     B.buildFPTOUI(S32, FloorMul);
1768   auto Lo = B.buildFPTOUI(S32, Fma);
1769 
1770   B.buildMerge(Dst, { Lo, Hi });
1771   MI.eraseFromParent();
1772 
1773   return true;
1774 }
1775 
1776 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1777   MachineInstr &MI, MachineRegisterInfo &MRI,
1778   MachineIRBuilder &B) const {
1779   MachineFunction &MF = B.getMF();
1780   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1781 
1782   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1783                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1784 
1785   // With ieee_mode disabled, the instructions have the correct behavior
1786   // already for G_FMINNUM/G_FMAXNUM
1787   if (!MFI->getMode().IEEE)
1788     return !IsIEEEOp;
1789 
1790   if (IsIEEEOp)
1791     return true;
1792 
1793   MachineIRBuilder HelperBuilder(MI);
1794   GISelObserverWrapper DummyObserver;
1795   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1796   HelperBuilder.setInstr(MI);
1797   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1798 }
1799 
1800 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1801   MachineInstr &MI, MachineRegisterInfo &MRI,
1802   MachineIRBuilder &B) const {
1803   // TODO: Should move some of this into LegalizerHelper.
1804 
1805   // TODO: Promote dynamic indexing of s16 to s32
1806 
1807   // FIXME: Artifact combiner probably should have replaced the truncated
1808   // constant before this, so we shouldn't need
1809   // getConstantVRegValWithLookThrough.
1810   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1811     MI.getOperand(2).getReg(), MRI);
1812   if (!IdxVal) // Dynamic case will be selected to register indexing.
1813     return true;
1814 
1815   Register Dst = MI.getOperand(0).getReg();
1816   Register Vec = MI.getOperand(1).getReg();
1817 
1818   LLT VecTy = MRI.getType(Vec);
1819   LLT EltTy = VecTy.getElementType();
1820   assert(EltTy == MRI.getType(Dst));
1821 
1822   B.setInstr(MI);
1823 
1824   if (IdxVal->Value < VecTy.getNumElements())
1825     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1826   else
1827     B.buildUndef(Dst);
1828 
1829   MI.eraseFromParent();
1830   return true;
1831 }
1832 
1833 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1834   MachineInstr &MI, MachineRegisterInfo &MRI,
1835   MachineIRBuilder &B) const {
1836   // TODO: Should move some of this into LegalizerHelper.
1837 
1838   // TODO: Promote dynamic indexing of s16 to s32
1839 
1840   // FIXME: Artifact combiner probably should have replaced the truncated
1841   // constant before this, so we shouldn't need
1842   // getConstantVRegValWithLookThrough.
1843   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1844     MI.getOperand(3).getReg(), MRI);
1845   if (!IdxVal) // Dynamic case will be selected to register indexing.
1846     return true;
1847 
1848   Register Dst = MI.getOperand(0).getReg();
1849   Register Vec = MI.getOperand(1).getReg();
1850   Register Ins = MI.getOperand(2).getReg();
1851 
1852   LLT VecTy = MRI.getType(Vec);
1853   LLT EltTy = VecTy.getElementType();
1854   assert(EltTy == MRI.getType(Ins));
1855 
1856   B.setInstr(MI);
1857 
1858   if (IdxVal->Value < VecTy.getNumElements())
1859     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1860   else
1861     B.buildUndef(Dst);
1862 
1863   MI.eraseFromParent();
1864   return true;
1865 }
1866 
1867 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1868   MachineInstr &MI, MachineRegisterInfo &MRI,
1869   MachineIRBuilder &B) const {
1870   const LLT V2S16 = LLT::vector(2, 16);
1871 
1872   Register Dst = MI.getOperand(0).getReg();
1873   Register Src0 = MI.getOperand(1).getReg();
1874   LLT DstTy = MRI.getType(Dst);
1875   LLT SrcTy = MRI.getType(Src0);
1876 
1877   if (SrcTy == V2S16 && DstTy == V2S16 &&
1878       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1879     return true;
1880 
1881   MachineIRBuilder HelperBuilder(MI);
1882   GISelObserverWrapper DummyObserver;
1883   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1884   HelperBuilder.setInstr(MI);
1885   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1886 }
1887 
1888 bool AMDGPULegalizerInfo::legalizeSinCos(
1889   MachineInstr &MI, MachineRegisterInfo &MRI,
1890   MachineIRBuilder &B) const {
1891   B.setInstr(MI);
1892 
1893   Register DstReg = MI.getOperand(0).getReg();
1894   Register SrcReg = MI.getOperand(1).getReg();
1895   LLT Ty = MRI.getType(DstReg);
1896   unsigned Flags = MI.getFlags();
1897 
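  // The hardware sin/cos intrinsics take an input scaled by 1/(2*pi). On
  // subtargets where their valid input range is reduced, also apply fract so
  // the operand ends up in [0, 1).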
1898   Register TrigVal;
1899   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1900   if (ST.hasTrigReducedRange()) {
1901     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1902     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1903       .addUse(MulVal.getReg(0))
1904       .setMIFlags(Flags).getReg(0);
1905   } else
1906     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1907 
1908   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1909     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1910   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1911     .addUse(TrigVal)
1912     .setMIFlags(Flags);
1913   MI.eraseFromParent();
1914   return true;
1915 }
1916 
1917 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1918   Register DstReg, LLT PtrTy,
1919   MachineIRBuilder &B, const GlobalValue *GV,
1920   unsigned Offset, unsigned GAFlags) const {
1921   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1922   // to the following code sequence:
1923   //
1924   // For constant address space:
1925   //   s_getpc_b64 s[0:1]
1926   //   s_add_u32 s0, s0, $symbol
1927   //   s_addc_u32 s1, s1, 0
1928   //
1929   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1930   //   a fixup or relocation is emitted to replace $symbol with a literal
1931   //   constant, which is a pc-relative offset from the encoding of the $symbol
1932   //   operand to the global variable.
1933   //
1934   // For global address space:
1935   //   s_getpc_b64 s[0:1]
1936   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1937   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1938   //
1939   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1940   //   fixups or relocations are emitted to replace $symbol@*@lo and
1941   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1942   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1943   //   operand to the global variable.
1944   //
1945   // What we want here is an offset from the value returned by s_getpc
1946   // (which is the address of the s_add_u32 instruction) to the global
1947   // variable, but since the encoding of $symbol starts 4 bytes after the start
1948   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1949   // small. This requires us to add 4 to the global variable offset in order to
1950   // compute the correct address.
1951 
1952   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1953 
1954   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1955     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1956 
1957   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1958     .addDef(PCReg);
1959 
1960   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1961   if (GAFlags == SIInstrInfo::MO_NONE)
1962     MIB.addImm(0);
1963   else
1964     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1965 
1966   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1967 
1968   if (PtrTy.getSizeInBits() == 32)
1969     B.buildExtract(DstReg, PCReg, 0);
1970   return true;
}
1972 
1973 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1974   MachineInstr &MI, MachineRegisterInfo &MRI,
1975   MachineIRBuilder &B) const {
1976   Register DstReg = MI.getOperand(0).getReg();
1977   LLT Ty = MRI.getType(DstReg);
1978   unsigned AS = Ty.getAddressSpace();
1979 
1980   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1981   MachineFunction &MF = B.getMF();
1982   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1983   B.setInstr(MI);
1984 
1985   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1986     if (!MFI->isEntryFunction()) {
1987       const Function &Fn = MF.getFunction();
1988       DiagnosticInfoUnsupported BadLDSDecl(
1989         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1990         DS_Warning);
1991       Fn.getContext().diagnose(BadLDSDecl);
1992 
1993       // We currently don't have a way to correctly allocate LDS objects that
1994       // aren't directly associated with a kernel. We do force inlining of
1995       // functions that use local objects. However, if these dead functions are
1996       // not eliminated, we don't want a compile time error. Just emit a warning
1997       // and a trap, since there should be no callable path here.
1998       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1999       B.buildUndef(DstReg);
2000       MI.eraseFromParent();
2001       return true;
2002     }
2003 
2004     // TODO: We could emit code to handle the initialization somewhere.
2005     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2006       const SITargetLowering *TLI = ST.getTargetLowering();
2007       if (!TLI->shouldUseLDSConstAddress(GV)) {
2008         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2009         return true; // Leave in place;
2010       }
2011 
2012       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2013       MI.eraseFromParent();
2014       return true;
2015     }
2016 
2017     const Function &Fn = MF.getFunction();
2018     DiagnosticInfoUnsupported BadInit(
2019       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2020     Fn.getContext().diagnose(BadInit);
2021     return true;
2022   }
2023 
2024   const SITargetLowering *TLI = ST.getTargetLowering();
2025 
2026   if (TLI->shouldEmitFixup(GV)) {
2027     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2028     MI.eraseFromParent();
2029     return true;
2030   }
2031 
2032   if (TLI->shouldEmitPCReloc(GV)) {
2033     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2034     MI.eraseFromParent();
2035     return true;
2036   }
2037 
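  // Neither a fixup nor a pc-relative relocation applies, so go through the
  // GOT: materialize a pc-relative address of the GOT entry and load the
  // 64-bit global address from it (truncating afterwards for 32-bit pointers).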
2038   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2039   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2040 
2041   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2042       MachinePointerInfo::getGOT(MF),
2043       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2044           MachineMemOperand::MOInvariant,
2045       8 /*Size*/, Align(8));
2046 
2047   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2048 
2049   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2051     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2052     B.buildExtract(DstReg, Load, 0);
2053   } else
2054     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2055 
2056   MI.eraseFromParent();
2057   return true;
2058 }
2059 
2060 bool AMDGPULegalizerInfo::legalizeLoad(
2061   MachineInstr &MI, MachineRegisterInfo &MRI,
2062   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2063   B.setInstr(MI);
2064   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2065   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2066   Observer.changingInstr(MI);
2067   MI.getOperand(1).setReg(Cast.getReg(0));
2068   Observer.changedInstr(MI);
2069   return true;
2070 }
2071 
2072 bool AMDGPULegalizerInfo::legalizeFMad(
2073   MachineInstr &MI, MachineRegisterInfo &MRI,
2074   MachineIRBuilder &B) const {
2075   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2076   assert(Ty.isScalar());
2077 
2078   MachineFunction &MF = B.getMF();
2079   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2080 
2081   // TODO: Always legal with future ftz flag.
2082   // FIXME: Do we need just output?
2083   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2084     return true;
2085   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2086     return true;
2087 
2088   MachineIRBuilder HelperBuilder(MI);
2089   GISelObserverWrapper DummyObserver;
2090   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2091   HelperBuilder.setInstr(MI);
2092   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2093 }
2094 
2095 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2096   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2097   Register DstReg = MI.getOperand(0).getReg();
2098   Register PtrReg = MI.getOperand(1).getReg();
2099   Register CmpVal = MI.getOperand(2).getReg();
2100   Register NewVal = MI.getOperand(3).getReg();
2101 
2102   assert(SITargetLowering::isFlatGlobalAddrSpace(
2103            MRI.getType(PtrReg).getAddressSpace()) &&
2104          "this should not have been custom lowered");
2105 
2106   LLT ValTy = MRI.getType(CmpVal);
2107   LLT VecTy = LLT::vector(2, ValTy);
2108 
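  // The target cmpxchg pseudo takes the new value and the compare value
  // packed together as a single vector operand.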
2109   B.setInstr(MI);
2110   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2111 
2112   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2113     .addDef(DstReg)
2114     .addUse(PtrReg)
2115     .addUse(PackedVal)
2116     .setMemRefs(MI.memoperands());
2117 
2118   MI.eraseFromParent();
2119   return true;
2120 }
2121 
2122 bool AMDGPULegalizerInfo::legalizeFlog(
2123   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2124   Register Dst = MI.getOperand(0).getReg();
2125   Register Src = MI.getOperand(1).getReg();
2126   LLT Ty = B.getMRI()->getType(Dst);
2127   unsigned Flags = MI.getFlags();
2128   B.setInstr(MI);
2129 
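  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes the precomputed
  // 1 / log2(b) for the requested base.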
2130   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2131   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2132 
2133   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2134   MI.eraseFromParent();
2135   return true;
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2139                                        MachineIRBuilder &B) const {
2140   Register Dst = MI.getOperand(0).getReg();
2141   Register Src = MI.getOperand(1).getReg();
2142   unsigned Flags = MI.getFlags();
2143   LLT Ty = B.getMRI()->getType(Dst);
2144   B.setInstr(MI);
2145 
2146   auto K = B.buildFConstant(Ty, numbers::log2e);
2147   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2148   B.buildFExp2(Dst, Mul, Flags);
2149   MI.eraseFromParent();
2150   return true;
2151 }
2152 
2153 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2154                                        MachineIRBuilder &B) const {
2155   Register Dst = MI.getOperand(0).getReg();
2156   Register Src0 = MI.getOperand(1).getReg();
2157   Register Src1 = MI.getOperand(2).getReg();
2158   unsigned Flags = MI.getFlags();
2159   LLT Ty = B.getMRI()->getType(Dst);
2160   B.setInstr(MI);
2161   const LLT S16 = LLT::scalar(16);
2162   const LLT S32 = LLT::scalar(32);
2163 
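  // Expand pow(x, y) as exp2(y * log2(x)). The legacy multiply is used for
  // the y * log2(x) product since it treats 0 * anything as 0, which keeps
  // cases like pow(1.0, y) well defined when y is infinite (log2(1) == 0).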
2164   if (Ty == S32) {
2165     auto Log = B.buildFLog2(S32, Src0, Flags);
2166     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2167       .addUse(Log.getReg(0))
2168       .addUse(Src1)
2169       .setMIFlags(Flags);
2170     B.buildFExp2(Dst, Mul, Flags);
2171   } else if (Ty == S16) {
2172     // There's no f16 fmul_legacy, so we need to convert for it.
2173     auto Log = B.buildFLog2(S16, Src0, Flags);
2174     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2175     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2176     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2177       .addUse(Ext0.getReg(0))
2178       .addUse(Ext1.getReg(0))
2179       .setMIFlags(Flags);
2180 
2181     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2182   } else
2183     return false;
2184 
2185   MI.eraseFromParent();
2186   return true;
2187 }
2188 
2189 // Find a source register, ignoring any possible source modifiers.
2190 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2191   Register ModSrc = OrigSrc;
2192   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2193     ModSrc = SrcFNeg->getOperand(1).getReg();
2194     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2195       ModSrc = SrcFAbs->getOperand(1).getReg();
2196   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2197     ModSrc = SrcFAbs->getOperand(1).getReg();
2198   return ModSrc;
2199 }
2200 
2201 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2202                                          MachineRegisterInfo &MRI,
2203                                          MachineIRBuilder &B) const {
2204   B.setInstr(MI);
2205 
2206   const LLT S1 = LLT::scalar(1);
2207   const LLT S64 = LLT::scalar(64);
2208   Register Dst = MI.getOperand(0).getReg();
2209   Register OrigSrc = MI.getOperand(1).getReg();
2210   unsigned Flags = MI.getFlags();
2211   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2212          "this should not have been custom lowered");
2213 
2214   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2215   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2216   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2217   // V_FRACT bug is:
2218   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2219   //
2220   // Convert floor(x) to (x - fract(x))
2221 
2222   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2223     .addUse(OrigSrc)
2224     .setMIFlags(Flags);
2225 
2226   // Give source modifier matching some assistance before obscuring a foldable
2227   // pattern.
2228 
2229   // TODO: We can avoid the neg on the fract? The input sign to fract
2230   // shouldn't matter?
2231   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2232 
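  // 0x3fefffffffffffff is the largest double less than 1.0, used to clamp
  // the fract result as described in the workaround above.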
2233   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2234 
2235   Register Min = MRI.createGenericVirtualRegister(S64);
2236 
2237   // We don't need to concern ourselves with the snan handling difference, so
2238   // use the one which will directly select.
2239   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2240   if (MFI->getMode().IEEE)
2241     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2242   else
2243     B.buildFMinNum(Min, Fract, Const, Flags);
2244 
2245   Register CorrectedFract = Min;
2246   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2248     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2249   }
2250 
2251   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2252   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2253 
2254   MI.eraseFromParent();
2255   return true;
2256 }
2257 
2258 // Turn an illegal packed v2s16 build vector into bit operations.
2259 // TODO: This should probably be a bitcast action in LegalizerHelper.
2260 bool AMDGPULegalizerInfo::legalizeBuildVector(
2261   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2262   Register Dst = MI.getOperand(0).getReg();
2263   const LLT S32 = LLT::scalar(32);
2264   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2265 
2266   Register Src0 = MI.getOperand(1).getReg();
2267   Register Src1 = MI.getOperand(2).getReg();
2268   assert(MRI.getType(Src0) == LLT::scalar(16));
2269 
2270   B.setInstr(MI);
2271   auto Merge = B.buildMerge(S32, {Src0, Src1});
2272   B.buildBitcast(Dst, Merge);
2273 
2274   MI.eraseFromParent();
2275   return true;
2276 }
2277 
2278 // Return the use branch instruction, otherwise null if the usage is invalid.
2279 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2280                                        MachineRegisterInfo &MRI,
2281                                        MachineInstr *&Br,
2282                                        MachineBasicBlock *&UncondBrTarget) {
2283   Register CondDef = MI.getOperand(0).getReg();
2284   if (!MRI.hasOneNonDBGUse(CondDef))
2285     return nullptr;
2286 
2287   MachineBasicBlock *Parent = MI.getParent();
2288   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2289   if (UseMI.getParent() != Parent ||
2290       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2291     return nullptr;
2292 
2293   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2294   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2295   if (Next == Parent->end()) {
2296     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2297     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2298       return nullptr;
2299     UncondBrTarget = &*NextMBB;
2300   } else {
2301     if (Next->getOpcode() != AMDGPU::G_BR)
2302       return nullptr;
2303     Br = &*Next;
2304     UncondBrTarget = Br->getOperand(0).getMBB();
2305   }
2306 
2307   return &UseMI;
2308 }
2309 
2310 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2311                                                MachineRegisterInfo &MRI,
2312                                                Register LiveIn,
2313                                                Register PhyReg) const {
2314   assert(PhyReg.isPhysical() && "Physical register expected");
2315 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2318   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2319   if (!MRI.getVRegDef(LiveIn)) {
2320     // FIXME: Should have scoped insert pt
2321     MachineBasicBlock &OrigInsBB = B.getMBB();
2322     auto OrigInsPt = B.getInsertPt();
2323 
2324     MachineBasicBlock &EntryMBB = B.getMF().front();
2325     EntryMBB.addLiveIn(PhyReg);
2326     B.setInsertPt(EntryMBB, EntryMBB.begin());
2327     B.buildCopy(LiveIn, PhyReg);
2328 
2329     B.setInsertPt(OrigInsBB, OrigInsPt);
2330   }
2331 
2332   return LiveIn;
2333 }
2334 
2335 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2336                                                 MachineRegisterInfo &MRI,
2337                                                 Register PhyReg, LLT Ty,
2338                                                 bool InsertLiveInCopy) const {
2339   assert(PhyReg.isPhysical() && "Physical register expected");
2340 
  // Get or create the virtual live-in register.
2342   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2343   if (!LiveIn) {
2344     LiveIn = MRI.createGenericVirtualRegister(Ty);
2345     MRI.addLiveIn(PhyReg, LiveIn);
2346   }
2347 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting a live-in copy from the
  // physical register to the virtual register is not required.
2351   if (!InsertLiveInCopy)
2352     return LiveIn;
2353 
2354   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2355 }
2356 
2357 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2358     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2359   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2360   const ArgDescriptor *Arg;
2361   const TargetRegisterClass *RC;
2362   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2363   if (!Arg) {
2364     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2365     return nullptr;
2366   }
2367   return Arg;
2368 }
2369 
2370 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2371                                          const ArgDescriptor *Arg) const {
2372   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2373     return false; // TODO: Handle these
2374 
2375   Register SrcReg = Arg->getRegister();
2376   assert(SrcReg.isPhysical() && "Physical register expected");
2377   assert(DstReg.isVirtual() && "Virtual register expected");
2378 
2379   MachineRegisterInfo &MRI = *B.getMRI();
2380 
2381   LLT Ty = MRI.getType(DstReg);
2382   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2383 
2384   if (Arg->isMasked()) {
2385     // TODO: Should we try to emit this once in the entry block?
2386     const LLT S32 = LLT::scalar(32);
2387     const unsigned Mask = Arg->getMask();
2388     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2389 
2390     Register AndMaskSrc = LiveIn;
2391 
2392     if (Shift != 0) {
2393       auto ShiftAmt = B.buildConstant(S32, Shift);
2394       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2395     }
2396 
2397     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2398   } else {
2399     B.buildCopy(DstReg, LiveIn);
2400   }
2401 
2402   return true;
2403 }
2404 
2405 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2406     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2407     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2408   B.setInstr(MI);
2409 
2410   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2411   if (!Arg)
2412     return false;
2413 
2414   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2415     return false;
2416 
2417   MI.eraseFromParent();
2418   return true;
2419 }
2420 
2421 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2422                                        MachineRegisterInfo &MRI,
2423                                        MachineIRBuilder &B) const {
2424   B.setInstr(MI);
2425   Register Dst = MI.getOperand(0).getReg();
2426   LLT DstTy = MRI.getType(Dst);
2427   LLT S16 = LLT::scalar(16);
2428   LLT S32 = LLT::scalar(32);
2429   LLT S64 = LLT::scalar(64);
2430 
2431   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2432     return true;
2433 
2434   if (DstTy == S16)
2435     return legalizeFDIV16(MI, MRI, B);
2436   if (DstTy == S32)
2437     return legalizeFDIV32(MI, MRI, B);
2438   if (DstTy == S64)
2439     return legalizeFDIV64(MI, MRI, B);
2440 
2441   return false;
2442 }
2443 
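// Build an integer reciprocal approximation of 2^32 / Src: convert Src to
// float, take the hardware reciprocal, scale it back up by 2^32 (0x4f800000),
// and convert to an unsigned integer.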
2444 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2445   const LLT S32 = LLT::scalar(32);
2446 
2447   auto Cvt0 = B.buildUITOFP(S32, Src);
2448   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2449   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2450   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2451   return B.buildFPTOUI(S32, Mul).getReg(0);
2452 }
2453 
2454 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2455                                                   Register DstReg,
2456                                                   Register Num,
2457                                                   Register Den,
2458                                                   bool IsRem) const {
2459   const LLT S1 = LLT::scalar(1);
2460   const LLT S32 = LLT::scalar(32);
2461 
2462   // RCP =  URECIP(Den) = 2^32 / Den + e
2463   // e is rounding error.
2464   auto RCP = buildDivRCP(B, Den);
2465 
2466   // RCP_LO = mul(RCP, Den)
2467   auto RCP_LO = B.buildMul(S32, RCP, Den);
2468 
  // RCP_HI = mulhu(RCP, Den)
2470   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2471 
2472   // NEG_RCP_LO = -RCP_LO
2473   auto Zero = B.buildConstant(S32, 0);
2474   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2475 
2476   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2477   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2478   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2479 
2480   // Calculate the rounding error from the URECIP instruction
2481   // E = mulhu(ABS_RCP_LO, RCP)
2482   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2483 
2484   // RCP_A_E = RCP + E
2485   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2486 
2487   // RCP_S_E = RCP - E
2488   auto RCP_S_E = B.buildSub(S32, RCP, E);
2489 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2491   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2492 
  // Quotient = mulhu(Tmp0, Num)
2494   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2495 
2496   // Num_S_Remainder = Quotient * Den
2497   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2498 
2499   // Remainder = Num - Num_S_Remainder
2500   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2501 
2502   // Remainder_GE_Den = Remainder >= Den
2503   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2504 
2505   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2506   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2507                                        Num, Num_S_Remainder);
2508 
2509   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2510   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2511 
2512   // Calculate Division result:
2513 
2514   // Quotient_A_One = Quotient + 1
2515   auto One = B.buildConstant(S32, 1);
2516   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2517 
2518   // Quotient_S_One = Quotient - 1
2519   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2520 
  // Div = (Tmp1 ? Quotient_A_One : Quotient)
  auto Div = B.buildSelect(S32, Tmp1, Quotient_A_One, Quotient);
2523 
2524   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2525   if (IsRem) {
2526     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2527 
2528     // Calculate Rem result:
2529     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2530 
2531     // Remainder_A_Den = Remainder + Den
2532     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2533 
2534     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2535     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2536 
2537     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2538     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2539   } else {
2540     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2541   }
2542 }
2543 
2544 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2545                                               MachineRegisterInfo &MRI,
2546                                               MachineIRBuilder &B) const {
2547   B.setInstr(MI);
2548   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2549   Register DstReg = MI.getOperand(0).getReg();
2550   Register Num = MI.getOperand(1).getReg();
2551   Register Den = MI.getOperand(2).getReg();
2552   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2553   MI.eraseFromParent();
2554   return true;
2555 }
2556 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32.
2558 //
2559 // Return lo, hi of result
2560 //
2561 // %cvt.lo = G_UITOFP Val.lo
2562 // %cvt.hi = G_UITOFP Val.hi
2563 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2564 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2565 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2566 // %mul2 = G_FMUL %mul1, 2**(-32)
2567 // %trunc = G_INTRINSIC_TRUNC %mul2
2568 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2569 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2570 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2571                                                        Register Val) {
2572   const LLT S32 = LLT::scalar(32);
2573   auto Unmerge = B.buildUnmerge(S32, Val);
2574 
2575   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2576   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2577 
2578   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2579                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2580 
2581   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2582   auto Mul1 =
2583       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2584 
2585   // 2**(-32)
2586   auto Mul2 =
2587       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2588   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2589 
2590   // -(2**32)
2591   auto Mad2 = B.buildFMAD(S32, Trunc,
2592                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2593 
2594   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2595   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2596 
2597   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2598 }
2599 
2600 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2601                                               MachineRegisterInfo &MRI,
2602                                               MachineIRBuilder &B) const {
2603   B.setInstr(MI);
2604 
2605   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2606   const LLT S32 = LLT::scalar(32);
2607   const LLT S64 = LLT::scalar(64);
2608   const LLT S1 = LLT::scalar(1);
2609   Register Numer = MI.getOperand(1).getReg();
2610   Register Denom = MI.getOperand(2).getReg();
2611   Register RcpLo, RcpHi;
2612 
2613   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2614 
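  // Refine the initial 64-bit reciprocal estimate with two rounds of
  // multiply/add corrections against -Denom, then form the quotient estimate
  // as mulhu(Numer, Rcp). The quotient and remainder are corrected below by
  // at most two conditional subtractions of the denominator.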
2615   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2616 
2617   auto Zero64 = B.buildConstant(S64, 0);
2618   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2619 
2620   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2621   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2622 
2623   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2624   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2625   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2626 
2627   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2628   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2629   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2630   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2631 
2632   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2633   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2634   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2635   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2636   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2637 
2638   auto Zero32 = B.buildConstant(S32, 0);
2639   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2640   auto Add2_HiC =
2641       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2642   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2643   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2644 
2645   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2646   Register NumerLo = UnmergeNumer.getReg(0);
2647   Register NumerHi = UnmergeNumer.getReg(1);
2648 
2649   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2650   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2651   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2652   Register Mul3_Lo = UnmergeMul3.getReg(0);
2653   Register Mul3_Hi = UnmergeMul3.getReg(1);
2654   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2655   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2656   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2657   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2658 
2659   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2660   Register DenomLo = UnmergeDenom.getReg(0);
2661   Register DenomHi = UnmergeDenom.getReg(1);
2662 
2663   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2664   auto C1 = B.buildSExt(S32, CmpHi);
2665 
2666   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2667   auto C2 = B.buildSExt(S32, CmpLo);
2668 
2669   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2670   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2671 
2672   // TODO: Here and below portions of the code can be enclosed into if/endif.
2673   // Currently control flow is unconditional and we have 4 selects after
2674   // potential endif to substitute PHIs.
2675 
2676   // if C3 != 0 ...
2677   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2678   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2679   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2680   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2681 
2682   auto One64 = B.buildConstant(S64, 1);
2683   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2684 
2685   auto C4 =
2686       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2687   auto C5 =
2688       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2689   auto C6 = B.buildSelect(
2690       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2691 
2692   // if (C6 != 0)
2693   auto Add4 = B.buildAdd(S64, Add3, One64);
2694   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2695 
2696   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2697   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2698   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2699 
2700   // endif C6
2701   // endif C3
2702 
2703   if (IsDiv) {
2704     auto Sel1 = B.buildSelect(
2705         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2706     B.buildSelect(MI.getOperand(0),
2707                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2708   } else {
2709     auto Sel2 = B.buildSelect(
2710         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2711     B.buildSelect(MI.getOperand(0),
2712                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2713   }
2714 
2715   MI.eraseFromParent();
2716   return true;
2717 }
2718 
2719 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2720                                             MachineRegisterInfo &MRI,
2721                                             MachineIRBuilder &B) const {
2722   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2723   if (Ty == LLT::scalar(32))
2724     return legalizeUDIV_UREM32(MI, MRI, B);
2725   if (Ty == LLT::scalar(64))
2726     return legalizeUDIV_UREM64(MI, MRI, B);
2727   return false;
2728 }
2729 
2730 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2731                                               MachineRegisterInfo &MRI,
2732                                               MachineIRBuilder &B) const {
2733   B.setInstr(MI);
2734   const LLT S32 = LLT::scalar(32);
2735 
2736   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2737   Register DstReg = MI.getOperand(0).getReg();
2738   Register LHS = MI.getOperand(1).getReg();
2739   Register RHS = MI.getOperand(2).getReg();
2740 
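  // Take absolute values with abs(x) = (x + sign) ^ sign, where sign = x >> 31
  // (arithmetic), do the unsigned division, then restore the signs: the
  // quotient is negative iff the operand signs differ, and the remainder
  // takes the sign of the dividend.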
2741   auto ThirtyOne = B.buildConstant(S32, 31);
2742   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2744 
2745   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2746   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2747 
2748   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2749   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2750 
2751   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2752   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2753 
2754   if (IsRem) {
2755     auto RSign = LHSign; // Remainder sign is the same as LHS
2756     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2757     B.buildSub(DstReg, UDivRem, RSign);
2758   } else {
2759     auto DSign = B.buildXor(S32, LHSign, RHSign);
2760     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2761     B.buildSub(DstReg, UDivRem, DSign);
2762   }
2763 
2764   MI.eraseFromParent();
2765   return true;
2766 }
2767 
2768 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2769                                             MachineRegisterInfo &MRI,
2770                                             MachineIRBuilder &B) const {
2771   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2772     return legalizeSDIV_SREM32(MI, MRI, B);
2773   return false;
2774 }
2775 
2776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2777                                                  MachineRegisterInfo &MRI,
2778                                                  MachineIRBuilder &B) const {
2779   Register Res = MI.getOperand(0).getReg();
2780   Register LHS = MI.getOperand(1).getReg();
2781   Register RHS = MI.getOperand(2).getReg();
2782 
2783   uint16_t Flags = MI.getFlags();
2784 
2785   LLT ResTy = MRI.getType(Res);
2786   LLT S32 = LLT::scalar(32);
2787   LLT S64 = LLT::scalar(64);
2788 
2789   const MachineFunction &MF = B.getMF();
2790   bool Unsafe =
2791     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2792 
2793   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2794     return false;
2795 
2796   if (!Unsafe && ResTy == S32 &&
2797       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2798     return false;
2799 
2800   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2801     // 1 / x -> RCP(x)
2802     if (CLHS->isExactlyValue(1.0)) {
2803       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2804         .addUse(RHS)
2805         .setMIFlags(Flags);
2806 
2807       MI.eraseFromParent();
2808       return true;
2809     }
2810 
2811     // -1 / x -> RCP( FNEG(x) )
2812     if (CLHS->isExactlyValue(-1.0)) {
2813       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2814       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2815         .addUse(FNeg.getReg(0))
2816         .setMIFlags(Flags);
2817 
2818       MI.eraseFromParent();
2819       return true;
2820     }
2821   }
2822 
2823   // x / y -> x * (1.0 / y)
2824   if (Unsafe) {
2825     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2826       .addUse(RHS)
2827       .setMIFlags(Flags);
2828     B.buildFMul(Res, LHS, RCP, Flags);
2829 
2830     MI.eraseFromParent();
2831     return true;
2832   }
2833 
2834   return false;
2835 }
2836 
2837 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2838                                          MachineRegisterInfo &MRI,
2839                                          MachineIRBuilder &B) const {
2840   B.setInstr(MI);
2841   Register Res = MI.getOperand(0).getReg();
2842   Register LHS = MI.getOperand(1).getReg();
2843   Register RHS = MI.getOperand(2).getReg();
2844 
2845   uint16_t Flags = MI.getFlags();
2846 
2847   LLT S16 = LLT::scalar(16);
2848   LLT S32 = LLT::scalar(32);
2849 
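  // Do the division in f32: multiply by the approximate reciprocal, truncate
  // back to f16, and let div_fixup handle the special cases (infinities,
  // NaNs, and zero denominators).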
2850   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2851   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2852 
2853   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2854     .addUse(RHSExt.getReg(0))
2855     .setMIFlags(Flags);
2856 
2857   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2858   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2859 
2860   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2861     .addUse(RDst.getReg(0))
2862     .addUse(RHS)
2863     .addUse(LHS)
2864     .setMIFlags(Flags);
2865 
2866   MI.eraseFromParent();
2867   return true;
2868 }
2869 
2870 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2871 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2872 static void toggleSPDenormMode(bool Enable,
2873                                MachineIRBuilder &B,
2874                                const GCNSubtarget &ST,
2875                                AMDGPU::SIModeRegisterDefaults Mode) {
2876   // Set SP denorm mode to this value.
2877   unsigned SPDenormMode =
2878     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2879 
2880   if (ST.hasDenormModeInst()) {
2881     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2882     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2883 
2884     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2885     B.buildInstr(AMDGPU::S_DENORM_MODE)
2886       .addImm(NewDenormModeValue);
2887 
2888   } else {
2889     // Select FP32 bit field in mode register.
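    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm control is the
    // 2-bit field at offset 4 of the MODE register (width is stored minus 1).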
2890     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2891                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2892                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2893 
2894     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2895       .addImm(SPDenormMode)
2896       .addImm(SPDenormModeBitField);
2897   }
2898 }
2899 
2900 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2901                                          MachineRegisterInfo &MRI,
2902                                          MachineIRBuilder &B) const {
2903   B.setInstr(MI);
2904   Register Res = MI.getOperand(0).getReg();
2905   Register LHS = MI.getOperand(1).getReg();
2906   Register RHS = MI.getOperand(2).getReg();
2907   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2908   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2909 
2910   uint16_t Flags = MI.getFlags();
2911 
2912   LLT S32 = LLT::scalar(32);
2913   LLT S1 = LLT::scalar(1);
2914 
2915   auto One = B.buildFConstant(S32, 1.0f);
2916 
2917   auto DenominatorScaled =
2918     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2919       .addUse(LHS)
2920       .addUse(RHS)
2921       .addImm(0)
2922       .setMIFlags(Flags);
2923   auto NumeratorScaled =
2924     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2925       .addUse(LHS)
2926       .addUse(RHS)
2927       .addImm(1)
2928       .setMIFlags(Flags);
2929 
2930   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2931     .addUse(DenominatorScaled.getReg(0))
2932     .setMIFlags(Flags);
2933   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2934 
2935   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2936   // aren't modeled as reading it.
2937   if (!Mode.allFP32Denormals())
2938     toggleSPDenormMode(true, B, ST, Mode);
2939 
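  // Newton-Raphson refinement: Fma0/Fma1 improve the approximate reciprocal of
  // the scaled denominator, Mul forms the quotient estimate, and Fma2..Fma4
  // compute the correction terms consumed by div_fmas below.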
2940   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2941   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2942   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2943   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2944   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2945   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2946 
2947   if (!Mode.allFP32Denormals())
2948     toggleSPDenormMode(false, B, ST, Mode);
2949 
2950   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2951     .addUse(Fma4.getReg(0))
2952     .addUse(Fma1.getReg(0))
2953     .addUse(Fma3.getReg(0))
2954     .addUse(NumeratorScaled.getReg(1))
2955     .setMIFlags(Flags);
2956 
2957   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2958     .addUse(Fmas.getReg(0))
2959     .addUse(RHS)
2960     .addUse(LHS)
2961     .setMIFlags(Flags);
2962 
2963   MI.eraseFromParent();
2964   return true;
2965 }
2966 
2967 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2968                                          MachineRegisterInfo &MRI,
2969                                          MachineIRBuilder &B) const {
2970   B.setInstr(MI);
2971   Register Res = MI.getOperand(0).getReg();
2972   Register LHS = MI.getOperand(1).getReg();
2973   Register RHS = MI.getOperand(2).getReg();
2974 
2975   uint16_t Flags = MI.getFlags();
2976 
2977   LLT S64 = LLT::scalar(64);
2978   LLT S1 = LLT::scalar(1);
2979 
2980   auto One = B.buildFConstant(S64, 1.0);
2981 
2982   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2983     .addUse(LHS)
2984     .addUse(RHS)
2985     .addImm(0)
2986     .setMIFlags(Flags);
2987 
2988   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2989 
2990   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2991     .addUse(DivScale0.getReg(0))
2992     .setMIFlags(Flags);
2993 
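  // Two Newton-Raphson iterations (Fma0..Fma3) refine the f64 reciprocal, Mul
  // forms the quotient estimate from the scaled numerator, and Fma4 computes
  // the remaining error term for div_fmas.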
2994   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2995   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2996   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2997 
2998   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2999     .addUse(LHS)
3000     .addUse(RHS)
3001     .addImm(1)
3002     .setMIFlags(Flags);
3003 
3004   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3005   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3006   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3007 
3008   Register Scale;
3009   if (!ST.hasUsableDivScaleConditionOutput()) {
3010     // Workaround a hardware bug on SI where the condition output from div_scale
3011     // is not usable.
3012 
3013     LLT S32 = LLT::scalar(32);
3014 
3015     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3016     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3017     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3018     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3019 
3020     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3021                               Scale1Unmerge.getReg(1));
3022     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3023                               Scale0Unmerge.getReg(1));
3024     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3025   } else {
3026     Scale = DivScale1.getReg(1);
3027   }
3028 
3029   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3030     .addUse(Fma4.getReg(0))
3031     .addUse(Fma3.getReg(0))
3032     .addUse(Mul.getReg(0))
3033     .addUse(Scale)
3034     .setMIFlags(Flags);
3035 
3036   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3037     .addUse(Fmas.getReg(0))
3038     .addUse(RHS)
3039     .addUse(LHS)
3040     .setMIFlags(Flags);
3041 
3042   MI.eraseFromParent();
3043   return true;
3044 }
3045 
3046 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3047                                                  MachineRegisterInfo &MRI,
3048                                                  MachineIRBuilder &B) const {
3049   B.setInstr(MI);
3050   Register Res = MI.getOperand(0).getReg();
3051   Register LHS = MI.getOperand(2).getReg();
3052   Register RHS = MI.getOperand(3).getReg();
3053   uint16_t Flags = MI.getFlags();
3054 
3055   LLT S32 = LLT::scalar(32);
3056   LLT S1 = LLT::scalar(1);
3057 
3058   auto Abs = B.buildFAbs(S32, RHS, Flags);
3059   const APFloat C0Val(1.0f);
3060 
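  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. If |denominator| exceeds 2^96,
  // pre-scale it by 2^-32 so its reciprocal stays in range, and apply the same
  // scale to the final product to compensate; otherwise scale by 1.0.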
3061   auto C0 = B.buildConstant(S32, 0x6f800000);
3062   auto C1 = B.buildConstant(S32, 0x2f800000);
3063   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3064 
3065   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3066   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3067 
3068   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3069 
3070   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3071     .addUse(Mul0.getReg(0))
3072     .setMIFlags(Flags);
3073 
3074   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3075 
3076   B.buildFMul(Res, Sel, Mul1, Flags);
3077 
3078   MI.eraseFromParent();
3079   return true;
3080 }
3081 
3082 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3083                                                  MachineRegisterInfo &MRI,
3084                                                  MachineIRBuilder &B) const {
3085   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3086   if (!MFI->isEntryFunction()) {
3087     return legalizePreloadedArgIntrin(MI, MRI, B,
3088                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3089   }
3090 
3091   B.setInstr(MI);
3092 
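  // In an entry function the implicit arguments live at a fixed offset past
  // the explicit kernel arguments, so compute them as the kernarg segment
  // pointer plus that offset.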
3093   uint64_t Offset =
3094     ST.getTargetLowering()->getImplicitParameterOffset(
3095       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3096   Register DstReg = MI.getOperand(0).getReg();
3097   LLT DstTy = MRI.getType(DstReg);
3098   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3099 
3100   const ArgDescriptor *Arg;
3101   const TargetRegisterClass *RC;
3102   std::tie(Arg, RC)
3103     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3104   if (!Arg)
3105     return false;
3106 
3107   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3108   if (!loadInputValue(KernargPtrReg, B, Arg))
3109     return false;
3110 
3111   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3112   MI.eraseFromParent();
3113   return true;
3114 }
3115 
3116 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3117                                               MachineRegisterInfo &MRI,
3118                                               MachineIRBuilder &B,
3119                                               unsigned AddrSpace) const {
3120   B.setInstr(MI);
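  // A flat pointer is in the queried segment iff the high 32 bits of the
  // pointer match that segment's aperture base.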
3121   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3122   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3123   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3124   MI.eraseFromParent();
3125   return true;
3126 }
3127 
3128 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3129 // offset (the offset that is included in bounds checking and swizzling, to be
3130 // split between the instruction's voffset and immoffset fields) and soffset
3131 // (the offset that is excluded from bounds checking and swizzling, to go in
3132 // the instruction's soffset field).  This function takes the first kind of
3133 // offset and figures out how to split it between voffset and immoffset.
3134 std::tuple<Register, unsigned, unsigned>
3135 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3136                                         Register OrigOffset) const {
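  // The buffer instruction's immediate offset field is 12 bits, so 4095 is the
  // largest offset that can be folded directly into the instruction.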
3137   const unsigned MaxImm = 4095;
3138   Register BaseReg;
3139   unsigned TotalConstOffset;
3140   MachineInstr *OffsetDef;
3141   const LLT S32 = LLT::scalar(32);
3142 
3143   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3144     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3145 
3146   unsigned ImmOffset = TotalConstOffset;
3147 
3148   // If the immediate value is too big for the immoffset field, put the value
3149   // and -4096 into the immoffset field so that the value that is copied/added
3150   // for the voffset field is a multiple of 4096, and it stands more chance
3151   // of being CSEd with the copy/add for another similar load/store.
3152   // However, do not do that rounding down to a multiple of 4096 if that is a
3153   // negative number, as it appears to be illegal to have a negative offset
3154   // in the vgpr, even if adding the immediate offset makes it positive.
3155   unsigned Overflow = ImmOffset & ~MaxImm;
3156   ImmOffset -= Overflow;
3157   if ((int32_t)Overflow < 0) {
3158     Overflow += ImmOffset;
3159     ImmOffset = 0;
3160   }
3161 
3162   if (Overflow != 0) {
3163     if (!BaseReg) {
3164       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3165     } else {
3166       auto OverflowVal = B.buildConstant(S32, Overflow);
3167       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3168     }
3169   }
3170 
3171   if (!BaseReg)
3172     BaseReg = B.buildConstant(S32, 0).getReg(0);
3173 
3174   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3175 }
3176 
3177 /// Handle register layout difference for f16 images for some subtargets.
3178 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3179                                              MachineRegisterInfo &MRI,
3180                                              Register Reg) const {
3181   if (!ST.hasUnpackedD16VMem())
3182     return Reg;
3183 
3184   const LLT S16 = LLT::scalar(16);
3185   const LLT S32 = LLT::scalar(32);
3186   LLT StoreVT = MRI.getType(Reg);
3187   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3188 
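  // In the unpacked layout every 16-bit element occupies the low half of its
  // own 32-bit register, so split the vector and any-extend each element.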
3189   auto Unmerge = B.buildUnmerge(S16, Reg);
3190 
3191   SmallVector<Register, 4> WideRegs;
3192   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3193     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3194 
3195   int NumElts = StoreVT.getNumElements();
3196 
3197   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3198 }
3199 
3200 Register AMDGPULegalizerInfo::fixStoreSourceType(
3201   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3202   MachineRegisterInfo *MRI = B.getMRI();
3203   LLT Ty = MRI->getType(VData);
3204 
3205   const LLT S16 = LLT::scalar(16);
3206 
3207   // Fixup illegal register types for i8 stores.
3208   if (Ty == LLT::scalar(8) || Ty == S16) {
3209     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3210     return AnyExt;
3211   }
3212 
3213   if (Ty.isVector()) {
3214     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3215       if (IsFormat)
3216         return handleD16VData(B, *MRI, VData);
3217     }
3218   }
3219 
3220   return VData;
3221 }
3222 
3223 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3224                                               MachineRegisterInfo &MRI,
3225                                               MachineIRBuilder &B,
3226                                               bool IsTyped,
3227                                               bool IsFormat) const {
3228   B.setInstr(MI);
3229 
3230   Register VData = MI.getOperand(1).getReg();
3231   LLT Ty = MRI.getType(VData);
3232   LLT EltTy = Ty.getScalarType();
3233   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3234   const LLT S32 = LLT::scalar(32);
3235 
3236   VData = fixStoreSourceType(B, VData, IsFormat);
3237   Register RSrc = MI.getOperand(2).getReg();
3238 
3239   MachineMemOperand *MMO = *MI.memoperands_begin();
3240   const int MemSize = MMO->getSize();
3241 
3242   unsigned ImmOffset;
3243   unsigned TotalOffset;
3244 
3245   // The typed intrinsics add an immediate after the registers.
3246   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3247 
3248   // The struct intrinsic variants add one additional operand over raw.
3249   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3250   Register VIndex;
3251   int OpOffset = 0;
3252   if (HasVIndex) {
3253     VIndex = MI.getOperand(3).getReg();
3254     OpOffset = 1;
3255   }
3256 
3257   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3258   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3259 
3260   unsigned Format = 0;
3261   if (IsTyped) {
3262     Format = MI.getOperand(5 + OpOffset).getImm();
3263     ++OpOffset;
3264   }
3265 
3266   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3267 
3268   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3269   if (TotalOffset != 0)
3270     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3271 
3272   unsigned Opc;
3273   if (IsTyped) {
3274     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3275                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3276   } else if (IsFormat) {
3277     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3278                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3279   } else {
3280     switch (MemSize) {
3281     case 1:
3282       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3283       break;
3284     case 2:
3285       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3286       break;
3287     default:
3288       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3289       break;
3290     }
3291   }
3292 
3293   if (!VIndex)
3294     VIndex = B.buildConstant(S32, 0).getReg(0);
3295 
3296   auto MIB = B.buildInstr(Opc)
3297     .addUse(VData)              // vdata
3298     .addUse(RSrc)               // rsrc
3299     .addUse(VIndex)             // vindex
3300     .addUse(VOffset)            // voffset
3301     .addUse(SOffset)            // soffset
3302     .addImm(ImmOffset);         // offset(imm)
3303 
3304   if (IsTyped)
3305     MIB.addImm(Format);
3306 
3307   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3308      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3309      .addMemOperand(MMO);
3310 
3311   MI.eraseFromParent();
3312   return true;
3313 }
3314 
3315 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3316                                              MachineRegisterInfo &MRI,
3317                                              MachineIRBuilder &B,
3318                                              bool IsFormat,
3319                                              bool IsTyped) const {
3320   B.setInstr(MI);
3321 
3322   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3323   MachineMemOperand *MMO = *MI.memoperands_begin();
3324   const int MemSize = MMO->getSize();
3325   const LLT S32 = LLT::scalar(32);
3326 
3327   Register Dst = MI.getOperand(0).getReg();
3328   Register RSrc = MI.getOperand(2).getReg();
3329 
3330   // The typed intrinsics add an immediate after the registers.
3331   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3332 
3333   // The struct intrinsic variants add one additional operand over raw.
3334   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3335   Register VIndex;
3336   int OpOffset = 0;
3337   if (HasVIndex) {
3338     VIndex = MI.getOperand(3).getReg();
3339     OpOffset = 1;
3340   }
3341 
3342   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3343   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3344 
3345   unsigned Format = 0;
3346   if (IsTyped) {
3347     Format = MI.getOperand(5 + OpOffset).getImm();
3348     ++OpOffset;
3349   }
3350 
3351   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3352   unsigned ImmOffset;
3353   unsigned TotalOffset;
3354 
3355   LLT Ty = MRI.getType(Dst);
3356   LLT EltTy = Ty.getScalarType();
3357   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3358   const bool Unpacked = ST.hasUnpackedD16VMem();
3359 
3360   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3361   if (TotalOffset != 0)
3362     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3363 
3364   unsigned Opc;
3365 
3366   if (IsTyped) {
3367     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3368                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3369   } else if (IsFormat) {
3370     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3371                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3372   } else {
3373     switch (MemSize) {
3374     case 1:
3375       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3376       break;
3377     case 2:
3378       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3379       break;
3380     default:
3381       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3382       break;
3383     }
3384   }
3385 
3386   Register LoadDstReg;
3387 
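  // Sub-dword loads and scalar d16 loads produce a 32-bit result that must be
  // truncated back to the original type after the instruction is built.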
3388   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3389   LLT UnpackedTy = Ty.changeElementSize(32);
3390 
3391   if (IsExtLoad)
3392     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3393   else if (Unpacked && IsD16 && Ty.isVector())
3394     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3395   else
3396     LoadDstReg = Dst;
3397 
3398   if (!VIndex)
3399     VIndex = B.buildConstant(S32, 0).getReg(0);
3400 
3401   auto MIB = B.buildInstr(Opc)
3402     .addDef(LoadDstReg)         // vdata
3403     .addUse(RSrc)               // rsrc
3404     .addUse(VIndex)             // vindex
3405     .addUse(VOffset)            // voffset
3406     .addUse(SOffset)            // soffset
3407     .addImm(ImmOffset);         // offset(imm)
3408 
3409   if (IsTyped)
3410     MIB.addImm(Format);
3411 
3412   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3413      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3414      .addMemOperand(MMO);
3415 
3416   if (LoadDstReg != Dst) {
3417     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3418 
    // The result was widened for the extending load; truncate it back.
3420     if (IsExtLoad)
3421       B.buildTrunc(Dst, LoadDstReg);
3422     else {
3423       // Repack to original 16-bit vector result
3424       // FIXME: G_TRUNC should work, but legalization currently fails
3425       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3426       SmallVector<Register, 4> Repack;
3427       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3428         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3429       B.buildMerge(Dst, Repack);
3430     }
3431   }
3432 
3433   MI.eraseFromParent();
3434   return true;
3435 }
3436 
3437 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3438                                                MachineIRBuilder &B,
3439                                                bool IsInc) const {
3440   B.setInstr(MI);
3441   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3442                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3443   B.buildInstr(Opc)
3444     .addDef(MI.getOperand(0).getReg())
3445     .addUse(MI.getOperand(2).getReg())
3446     .addUse(MI.getOperand(3).getReg())
3447     .cloneMemRefs(MI);
3448   MI.eraseFromParent();
3449   return true;
3450 }
3451 
3452 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3453   switch (IntrID) {
3454   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3455   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3456     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3457   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3458   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3459     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3460   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3461   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3462     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3463   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3464   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3465     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3466   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3467   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3468     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3469   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3470   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3471     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3472   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3473   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3474     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3475   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3476   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3477     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3478   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3479   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3480     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3481   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3482   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3483     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3484   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3485   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3486     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3487   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3488   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3489     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3490   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3491   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3492     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3493   default:
3494     llvm_unreachable("unhandled atomic opcode");
3495   }
3496 }
3497 
3498 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3499                                                MachineIRBuilder &B,
3500                                                Intrinsic::ID IID) const {
3501   B.setInstr(MI);
3502 
3503   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3504                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3505 
3506   Register Dst = MI.getOperand(0).getReg();
3507   Register VData = MI.getOperand(2).getReg();
3508 
3509   Register CmpVal;
3510   int OpOffset = 0;
3511 
3512   if (IsCmpSwap) {
3513     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3514     ++OpOffset;
3515   }
3516 
3517   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3518   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3519 
3520   // The struct intrinsic variants add one additional operand over raw.
3521   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3522   Register VIndex;
3523   if (HasVIndex) {
3524     VIndex = MI.getOperand(4 + OpOffset).getReg();
3525     ++OpOffset;
3526   }
3527 
3528   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3529   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3530   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3531 
3532   MachineMemOperand *MMO = *MI.memoperands_begin();
3533 
3534   unsigned ImmOffset;
3535   unsigned TotalOffset;
3536   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3537   if (TotalOffset != 0)
3538     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3539 
3540   if (!VIndex)
3541     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3542 
3543   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3544     .addDef(Dst)
3545     .addUse(VData); // vdata
3546 
3547   if (IsCmpSwap)
3548     MIB.addReg(CmpVal);
3549 
3550   MIB.addUse(RSrc)               // rsrc
3551      .addUse(VIndex)             // vindex
3552      .addUse(VOffset)            // voffset
3553      .addUse(SOffset)            // soffset
3554      .addImm(ImmOffset)          // offset(imm)
3555      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3556      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3557      .addMemOperand(MMO);
3558 
3559   MI.eraseFromParent();
3560   return true;
3561 }
3562 
/// Turn the s16 typed address operands of the image instruction \p MI into
/// dword sized <2 x s16> vectors, appending the results to \p PackedAddrs.
3565 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3566                                         SmallVectorImpl<Register> &PackedAddrs,
3567                                         int AddrIdx, int DimIdx, int NumVAddrs,
3568                                         int NumGradients) {
3569   const LLT S16 = LLT::scalar(16);
3570   const LLT V2S16 = LLT::vector(2, 16);
3571 
3572   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3573     MachineOperand &SrcOp = MI.getOperand(I);
3574     if (!SrcOp.isReg())
3575       continue; // _L to _LZ may have eliminated this.
3576 
3577     Register AddrReg = SrcOp.getReg();
3578 
3579     if (I < DimIdx) {
3580       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3581       PackedAddrs.push_back(AddrReg);
3582     } else {
3583       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3584       // derivatives dx/dh and dx/dv are packed with undef.
3585       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3586           ((NumGradients / 2) % 2 == 1 &&
3587            (I == DimIdx + (NumGradients / 2) - 1 ||
3588             I == DimIdx + NumGradients - 1)) ||
3589           // Check for _L to _LZ optimization
3590           !MI.getOperand(I + 1).isReg()) {
3591         PackedAddrs.push_back(
3592             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3593                 .getReg(0));
3594       } else {
3595         PackedAddrs.push_back(
3596             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3597                 .getReg(0));
3598         ++I;
3599       }
3600     }
3601   }
3602 }
3603 
3604 /// Convert from separate vaddr components to a single vector address register,
3605 /// and replace the remaining operands with $noreg.
3606 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3607                                      int DimIdx, int NumVAddrs) {
3608   const LLT S32 = LLT::scalar(32);
3609 
3610   SmallVector<Register, 8> AddrRegs;
3611   for (int I = 0; I != NumVAddrs; ++I) {
3612     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3613     if (SrcOp.isReg()) {
3614       AddrRegs.push_back(SrcOp.getReg());
3615       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3616     }
3617   }
3618 
3619   int NumAddrRegs = AddrRegs.size();
3620   if (NumAddrRegs != 1) {
3621     // Round up to 8 elements for v5-v7
3622     // FIXME: Missing intermediate sized register classes and instructions.
3623     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3624       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3625       auto Undef = B.buildUndef(S32);
3626       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3627       NumAddrRegs = RoundedNumRegs;
3628     }
3629 
3630     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3631     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3632   }
3633 
3634   for (int I = 1; I != NumVAddrs; ++I) {
3635     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3636     if (SrcOp.isReg())
3637       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3638   }
3639 }
3640 
3641 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3642 ///
3643 /// Depending on the subtarget, load/store with 16-bit element data need to be
3644 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3645 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3646 /// registers.
3647 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now unnecessary arguments with $noreg.
3654 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3655     MachineInstr &MI, MachineIRBuilder &B,
3656     GISelChangeObserver &Observer,
3657     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3658   B.setInstr(MI);
3659 
3660   const int NumDefs = MI.getNumExplicitDefs();
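  // A second explicit def means the intrinsic also returns the TFE (texture
  // fail enable) status dword alongside the loaded data.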
3661   bool IsTFE = NumDefs == 2;
3662   // We are only processing the operands of d16 image operations on subtargets
3663   // that use the unpacked register layout, or need to repack the TFE result.
3664 
3665   // TODO: Do we need to guard against already legalized intrinsics?
3666   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3667     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3668 
3669   MachineRegisterInfo *MRI = B.getMRI();
3670   const LLT S32 = LLT::scalar(32);
3671   const LLT S16 = LLT::scalar(16);
3672   const LLT V2S16 = LLT::vector(2, 16);
3673 
3674   // Index of first address argument
3675   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3676 
3677   // Check for 16 bit addresses and pack if true.
3678   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3679   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3680   const bool IsA16 = AddrTy == S16;
3681 
3682   int NumVAddrs, NumGradients;
3683   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3684   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3685     getDMaskIdx(BaseOpcode, NumDefs);
3686   unsigned DMask = 0;
3687 
3688   int DMaskLanes = 0;
3689   if (!BaseOpcode->Atomic) {
3690     DMask = MI.getOperand(DMaskIdx).getImm();
3691     if (BaseOpcode->Gather4) {
3692       DMaskLanes = 4;
3693     } else if (DMask != 0) {
3694       DMaskLanes = countPopulation(DMask);
3695     } else if (!IsTFE && !BaseOpcode->Store) {
3696       // If dmask is 0, this is a no-op load. This can be eliminated.
3697       B.buildUndef(MI.getOperand(0));
3698       MI.eraseFromParent();
3699       return true;
3700     }
3701   }
3702 
3703   Observer.changingInstr(MI);
3704   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3705 
3706   unsigned NewOpcode = NumDefs == 0 ?
3707     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3708 
3709   // Track that we legalized this
3710   MI.setDesc(B.getTII().get(NewOpcode));
3711 
  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
3714   if (IsTFE && DMask == 0) {
3715     DMask = 0x1;
3716     DMaskLanes = 1;
3717     MI.getOperand(DMaskIdx).setImm(DMask);
3718   }
3719 
3720   if (BaseOpcode->Atomic) {
3721     Register VData0 = MI.getOperand(2).getReg();
3722     LLT Ty = MRI->getType(VData0);
3723 
3724     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3725     if (Ty.isVector())
3726       return false;
3727 
3728     if (BaseOpcode->AtomicX2) {
3729       Register VData1 = MI.getOperand(3).getReg();
3730       // The two values are packed in one register.
3731       LLT PackedTy = LLT::vector(2, Ty);
3732       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3733       MI.getOperand(2).setReg(Concat.getReg(0));
3734       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3735     }
3736   }
3737 
3738   int CorrectedNumVAddrs = NumVAddrs;
3739 
3740   // Optimize _L to _LZ when _L is zero
3741   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3742         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3743     const ConstantFP *ConstantLod;
3744     const int LodIdx = AddrIdx + NumVAddrs - 1;
3745 
3746     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3747       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3748         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3749         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3750           LZMappingInfo->LZ, ImageDimIntr->Dim);
3751 
3752         // The starting indexes should remain in the same place.
3753         --NumVAddrs;
3754         --CorrectedNumVAddrs;
3755 
3756         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3757           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3758         MI.RemoveOperand(LodIdx);
3759       }
3760     }
3761   }
3762 
3763   // Optimize _mip away, when 'lod' is zero
3764   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3765     int64_t ConstantLod;
3766     const int LodIdx = AddrIdx + NumVAddrs - 1;
3767 
3768     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3769       if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
3772         MI.getOperand(LodIdx).ChangeToImmediate(0);
3773         --CorrectedNumVAddrs;
3774       }
3775     }
3776   }
3777 
3778   // If the register allocator cannot place the address registers contiguously
3779   // without introducing moves, then using the non-sequential address encoding
3780   // is always preferable, since it saves VALU instructions and is usually a
3781   // wash in terms of code size or even better.
3782   //
3783   // However, we currently have no way of hinting to the register allocator
3784   // that MIMG addresses should be placed contiguously when it is possible to
3785   // do so, so force non-NSA for the common 2-address case as a heuristic.
3786   //
3787   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3788   // allocation when possible.
3789   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3790 
3791   // Rewrite the addressing register layout before doing anything else.
3792   if (IsA16) {
3793     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3794     // should be introduced.
3795     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3796       return false;
3797 
3798     if (NumVAddrs > 1) {
3799       SmallVector<Register, 4> PackedRegs;
3800       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3801                                   NumGradients);
3802 
3803       if (!UseNSA && PackedRegs.size() > 1) {
3804         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3805         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3806         PackedRegs[0] = Concat.getReg(0);
3807         PackedRegs.resize(1);
3808       }
3809 
3810       const int NumPacked = PackedRegs.size();
3811       for (int I = 0; I != NumVAddrs; ++I) {
3812         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3813         if (!SrcOp.isReg()) {
3814           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3815           continue;
3816         }
3817 
3818         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3819 
3820         if (I < NumPacked)
3821           SrcOp.setReg(PackedRegs[I]);
3822         else
3823           SrcOp.setReg(AMDGPU::NoRegister);
3824       }
3825     }
3826   } else if (!UseNSA && NumVAddrs > 1) {
3827     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3828   }
3829 
3830 
3831   if (BaseOpcode->Store) { // No TFE for stores?
3832     // TODO: Handle dmask trim
3833     Register VData = MI.getOperand(1).getReg();
3834     LLT Ty = MRI->getType(VData);
3835     if (!Ty.isVector() || Ty.getElementType() != S16)
3836       return true;
3837 
3838     B.setInstr(MI);
3839 
3840     Register RepackedReg = handleD16VData(B, *MRI, VData);
3841     if (RepackedReg != VData) {
3842       MI.getOperand(1).setReg(RepackedReg);
3843     }
3844 
3845     return true;
3846   }
3847 
3848   Register DstReg = MI.getOperand(0).getReg();
3849   LLT Ty = MRI->getType(DstReg);
3850   const LLT EltTy = Ty.getScalarType();
3851   const bool IsD16 = Ty.getScalarType() == S16;
3852   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3853 
3854   // Confirm that the return type is large enough for the dmask specified
3855   if (NumElts < DMaskLanes)
3856     return false;
3857 
3858   if (NumElts > 4 || DMaskLanes > 4)
3859     return false;
3860 
3861   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3862   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3863 
3864   // The raw dword aligned data component of the load. The only legal cases
3865   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3867   LLT RoundedTy;
3868 
  // S32 vector to cover all data, plus TFE result element.
3870   LLT TFETy;
3871 
3872   // Register type to use for each loaded component. Will be S32 or V2S16.
3873   LLT RegTy;
3874 
3875   if (IsD16 && ST.hasUnpackedD16VMem()) {
3876     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3877     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3878     RegTy = S32;
3879   } else {
3880     unsigned EltSize = EltTy.getSizeInBits();
3881     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3882     unsigned RoundedSize = 32 * RoundedElts;
3883     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3884     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3885     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3886   }
3887 
3888   // The return type does not need adjustment.
3889   // TODO: Should we change s16 case to s32 or <2 x s16>?
3890   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3891     return true;
3892 
3893   Register Dst1Reg;
3894 
3895   // Insert after the instruction.
3896   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3897 
3898   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3899   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3900   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3901   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3902 
3903   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3904 
3905   MI.getOperand(0).setReg(NewResultReg);
3906 
3907   // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
3909   // register, with one additional dword beyond the loaded data. Rewrite the
3910   // return type to use a single register result.
3911 
3912   if (IsTFE) {
3913     Dst1Reg = MI.getOperand(1).getReg();
3914     if (MRI->getType(Dst1Reg) != S32)
3915       return false;
3916 
3917     // TODO: Make sure the TFE operand bit is set.
3918     MI.RemoveOperand(1);
3919 
3920     // Handle the easy case that requires no repack instructions.
3921     if (Ty == S32) {
3922       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3923       return true;
3924     }
3925   }
3926 
3927   // Now figure out how to copy the new result register back into the old
3928   // result.
3929   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3930 
3931   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3932 
3933   if (ResultNumRegs == 1) {
3934     assert(!IsTFE);
3935     ResultRegs[0] = NewResultReg;
3936   } else {
3937     // We have to repack into a new vector of some kind.
3938     for (int I = 0; I != NumDataRegs; ++I)
3939       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3940     B.buildUnmerge(ResultRegs, NewResultReg);
3941 
3942     // Drop the final TFE element to get the data part. The TFE result is
3943     // directly written to the right place already.
3944     if (IsTFE)
3945       ResultRegs.resize(NumDataRegs);
3946   }
3947 
3948   // For an s16 scalar result, we form an s32 result with a truncate regardless
3949   // of packed vs. unpacked.
3950   if (IsD16 && !Ty.isVector()) {
3951     B.buildTrunc(DstReg, ResultRegs[0]);
3952     return true;
3953   }
3954 
3955   // Avoid a build/concat_vector of 1 entry.
3956   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3957     B.buildBitcast(DstReg, ResultRegs[0]);
3958     return true;
3959   }
3960 
3961   assert(Ty.isVector());
3962 
3963   if (IsD16) {
3964     // For packed D16 results with TFE enabled, all the data components are
3965     // S32. Cast back to the expected type.
3966     //
    // TODO: We don't really need to load s32 elements. We would only need one
3968     // cast for the TFE result if a multiple of v2s16 was used.
3969     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3970       for (Register &Reg : ResultRegs)
3971         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3972     } else if (ST.hasUnpackedD16VMem()) {
3973       for (Register &Reg : ResultRegs)
3974         Reg = B.buildTrunc(S16, Reg).getReg(0);
3975     }
3976   }
3977 
3978   auto padWithUndef = [&](LLT Ty, int NumElts) {
3979     if (NumElts == 0)
3980       return;
3981     Register Undef = B.buildUndef(Ty).getReg(0);
3982     for (int I = 0; I != NumElts; ++I)
3983       ResultRegs.push_back(Undef);
3984   };
3985 
3986   // Pad out any elements eliminated due to the dmask.
3987   LLT ResTy = MRI->getType(ResultRegs[0]);
3988   if (!ResTy.isVector()) {
3989     padWithUndef(ResTy, NumElts - ResultRegs.size());
3990     B.buildBuildVector(DstReg, ResultRegs);
3991     return true;
3992   }
3993 
3994   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3995   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3996 
3997   // Deal with the one annoying legal case.
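  // A concat cannot produce v3s16 directly, so build a v6s16 with an undef
  // tail and unmerge it into the v3s16 result plus a dead v3s16 half.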
3998   const LLT V3S16 = LLT::vector(3, 16);
3999   if (Ty == V3S16) {
4000     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4001     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4002     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4003     return true;
4004   }
4005 
4006   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4007   B.buildConcatVectors(DstReg, ResultRegs);
4008   return true;
4009 }
4010 
4011 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4012   MachineInstr &MI, MachineIRBuilder &B,
4013   GISelChangeObserver &Observer) const {
4014   Register Dst = MI.getOperand(0).getReg();
4015   LLT Ty = B.getMRI()->getType(Dst);
4016   unsigned Size = Ty.getSizeInBits();
4017   MachineFunction &MF = B.getMF();
4018 
4019   Observer.changingInstr(MI);
4020 
4021   // FIXME: We don't really need this intermediate instruction. The intrinsic
4022   // should be fixed to have a memory operand. Since it's readnone, we're not
4023   // allowed to add one.
4024   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4025   MI.RemoveOperand(1); // Remove intrinsic ID
4026 
4027   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4028   // TODO: Should this use datalayout alignment?
4029   const unsigned MemSize = (Size + 7) / 8;
4030   const Align MemAlign(4);
4031   MachineMemOperand *MMO = MF.getMachineMemOperand(
4032       MachinePointerInfo(),
4033       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4034           MachineMemOperand::MOInvariant,
4035       MemSize, MemAlign);
4036   MI.addMemOperand(MF, MMO);
4037 
4038   // There are no 96-bit result scalar loads, but widening to 128-bit should
4039   // always be legal. We may need to restore this to a 96-bit result if it turns
4040   // out this needs to be converted to a vector load during RegBankSelect.
4041   if (!isPowerOf2_32(Size)) {
4042     LegalizerHelper Helper(MF, *this, Observer, B);
4043     B.setInstr(MI);
4044 
4045     if (Ty.isVector())
4046       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4047     else
4048       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4049   }
4050 
4051   Observer.changedInstr(MI);
4052   return true;
4053 }
4054 
4055 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4056                                                 MachineRegisterInfo &MRI,
4057                                                 MachineIRBuilder &B) const {
4058   B.setInstr(MI);
4059 
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4061   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4062       !ST.isTrapHandlerEnabled()) {
4063     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4064   } else {
4065     // Pass queue pointer to trap handler as input, and insert trap instruction
4066     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4067     const ArgDescriptor *Arg =
4068         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4069     if (!Arg)
4070       return false;
4071     MachineRegisterInfo &MRI = *B.getMRI();
4072     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4073     Register LiveIn = getLiveInRegister(
4074         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4075         /*InsertLiveInCopy=*/false);
4076     if (!loadInputValue(LiveIn, B, Arg))
4077       return false;
4078     B.buildCopy(SGPR01, LiveIn);
4079     B.buildInstr(AMDGPU::S_TRAP)
4080         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4081         .addReg(SGPR01, RegState::Implicit);
4082   }
4083 
4084   MI.eraseFromParent();
4085   return true;
4086 }
4087 
4088 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4089     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4090   B.setInstr(MI);
4091 
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4094   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4095       !ST.isTrapHandlerEnabled()) {
4096     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4097                                      "debugtrap handler not supported",
4098                                      MI.getDebugLoc(), DS_Warning);
4099     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4100     Ctx.diagnose(NoTrap);
4101   } else {
4102     // Insert debug-trap instruction
4103     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4104   }
4105 
4106   MI.eraseFromParent();
4107   return true;
4108 }
4109 
4110 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4111                                             MachineIRBuilder &B,
4112                                             GISelChangeObserver &Observer) const {
4113   MachineRegisterInfo &MRI = *B.getMRI();
4114 
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4116   auto IntrID = MI.getIntrinsicID();
4117   switch (IntrID) {
4118   case Intrinsic::amdgcn_if:
4119   case Intrinsic::amdgcn_else: {
4120     MachineInstr *Br = nullptr;
4121     MachineBasicBlock *UncondBrTarget = nullptr;
4122     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4123       const SIRegisterInfo *TRI
4124         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4125 
4126       B.setInstr(*BrCond);
4127       Register Def = MI.getOperand(1).getReg();
4128       Register Use = MI.getOperand(3).getReg();
4129 
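      // SI_IF/SI_ELSE manipulate EXEC and branch to the block the original
      // unconditional branch targeted; the original conditional-branch target
      // is instead reached through the rewritten (or rebuilt) G_BR below.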
4130       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4131       if (IntrID == Intrinsic::amdgcn_if) {
4132         B.buildInstr(AMDGPU::SI_IF)
4133           .addDef(Def)
4134           .addUse(Use)
4135           .addMBB(UncondBrTarget);
4136       } else {
4137         B.buildInstr(AMDGPU::SI_ELSE)
4138           .addDef(Def)
4139           .addUse(Use)
4140           .addMBB(UncondBrTarget)
4141           .addImm(0);
4142       }
4143 
4144       if (Br) {
4145         Br->getOperand(0).setMBB(CondBrTarget);
4146       } else {
4147         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4148         // since we're swapping branch targets it needs to be reinserted.
4149         // FIXME: IRTranslator should probably not do this
4150         B.buildBr(*CondBrTarget);
4151       }
4152 
4153       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4154       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4155       MI.eraseFromParent();
4156       BrCond->eraseFromParent();
4157       return true;
4158     }
4159 
4160     return false;
4161   }
4162   case Intrinsic::amdgcn_loop: {
4163     MachineInstr *Br = nullptr;
4164     MachineBasicBlock *UncondBrTarget = nullptr;
4165     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4166       const SIRegisterInfo *TRI
4167         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4168 
4169       B.setInstr(*BrCond);
4170 
4171       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4172       Register Reg = MI.getOperand(2).getReg();
4173       B.buildInstr(AMDGPU::SI_LOOP)
4174         .addUse(Reg)
4175         .addMBB(UncondBrTarget);
4176 
4177       if (Br)
4178         Br->getOperand(0).setMBB(CondBrTarget);
4179       else
4180         B.buildBr(*CondBrTarget);
4181 
4182       MI.eraseFromParent();
4183       BrCond->eraseFromParent();
4184       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4185       return true;
4186     }
4187 
4188     return false;
4189   }
4190   case Intrinsic::amdgcn_kernarg_segment_ptr:
4191     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4192       B.setInstr(MI);
4193       // This only makes sense to call in a kernel, so just lower to null.
4194       B.buildConstant(MI.getOperand(0).getReg(), 0);
4195       MI.eraseFromParent();
4196       return true;
4197     }
4198 
4199     return legalizePreloadedArgIntrin(
4200       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4201   case Intrinsic::amdgcn_implicitarg_ptr:
4202     return legalizeImplicitArgPtr(MI, MRI, B);
4203   case Intrinsic::amdgcn_workitem_id_x:
4204     return legalizePreloadedArgIntrin(MI, MRI, B,
4205                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4206   case Intrinsic::amdgcn_workitem_id_y:
4207     return legalizePreloadedArgIntrin(MI, MRI, B,
4208                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4209   case Intrinsic::amdgcn_workitem_id_z:
4210     return legalizePreloadedArgIntrin(MI, MRI, B,
4211                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4212   case Intrinsic::amdgcn_workgroup_id_x:
4213     return legalizePreloadedArgIntrin(MI, MRI, B,
4214                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4215   case Intrinsic::amdgcn_workgroup_id_y:
4216     return legalizePreloadedArgIntrin(MI, MRI, B,
4217                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4218   case Intrinsic::amdgcn_workgroup_id_z:
4219     return legalizePreloadedArgIntrin(MI, MRI, B,
4220                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4221   case Intrinsic::amdgcn_dispatch_ptr:
4222     return legalizePreloadedArgIntrin(MI, MRI, B,
4223                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4224   case Intrinsic::amdgcn_queue_ptr:
4225     return legalizePreloadedArgIntrin(MI, MRI, B,
4226                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4227   case Intrinsic::amdgcn_implicit_buffer_ptr:
4228     return legalizePreloadedArgIntrin(
4229       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4230   case Intrinsic::amdgcn_dispatch_id:
4231     return legalizePreloadedArgIntrin(MI, MRI, B,
4232                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4233   case Intrinsic::amdgcn_fdiv_fast:
4234     return legalizeFDIVFastIntrin(MI, MRI, B);
4235   case Intrinsic::amdgcn_is_shared:
4236     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4237   case Intrinsic::amdgcn_is_private:
4238     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4239   case Intrinsic::amdgcn_wavefrontsize: {
4240     B.setInstr(MI);
4241     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4242     MI.eraseFromParent();
4243     return true;
4244   }
4245   case Intrinsic::amdgcn_s_buffer_load:
4246     return legalizeSBufferLoad(MI, B, Observer);
4247   case Intrinsic::amdgcn_raw_buffer_store:
4248   case Intrinsic::amdgcn_struct_buffer_store:
4249     return legalizeBufferStore(MI, MRI, B, false, false);
4250   case Intrinsic::amdgcn_raw_buffer_store_format:
4251   case Intrinsic::amdgcn_struct_buffer_store_format:
4252     return legalizeBufferStore(MI, MRI, B, false, true);
4253   case Intrinsic::amdgcn_raw_tbuffer_store:
4254   case Intrinsic::amdgcn_struct_tbuffer_store:
4255     return legalizeBufferStore(MI, MRI, B, true, true);
4256   case Intrinsic::amdgcn_raw_buffer_load:
4257   case Intrinsic::amdgcn_struct_buffer_load:
4258     return legalizeBufferLoad(MI, MRI, B, false, false);
4259   case Intrinsic::amdgcn_raw_buffer_load_format:
4260   case Intrinsic::amdgcn_struct_buffer_load_format:
4261     return legalizeBufferLoad(MI, MRI, B, true, false);
4262   case Intrinsic::amdgcn_raw_tbuffer_load:
4263   case Intrinsic::amdgcn_struct_tbuffer_load:
4264     return legalizeBufferLoad(MI, MRI, B, true, true);
4265   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4266   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4267   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4268   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4269   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4270   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4271   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4272   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4273   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4274   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4275   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4276   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4277   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4278   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4279   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4280   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4281   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4282   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4283   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4284   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4285   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4286   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4287   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4288   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4289   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4290   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4291     return legalizeBufferAtomic(MI, B, IntrID);
4292   case Intrinsic::amdgcn_atomic_inc:
4293     return legalizeAtomicIncDec(MI, B, true);
4294   case Intrinsic::amdgcn_atomic_dec:
4295     return legalizeAtomicIncDec(MI, B, false);
4296   case Intrinsic::trap:
4297     return legalizeTrapIntrinsic(MI, MRI, B);
4298   case Intrinsic::debugtrap:
4299     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4300   default: {
4301     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4302             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4303       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4304     return true;
4305   }
4306   }
4307 
4308   return true;
4309 }
4310