1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
46 // Round the number of vector elements up to the next power of two.
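// e.g. <3 x s16> becomes <4 x s16>.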
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the scalar size in bits up to the next power of two.
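// e.g. s48 becomes s64.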
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
70   return [=](const LegalityQuery &Query) {
71     return Query.Types[TypeIdx].getSizeInBits() == Size;
72   };
73 }
74 
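// Vectors with an odd number of sub-32-bit elements whose total size is not a
// multiple of 32 bits, e.g. <3 x s16>.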
75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     return Ty.isVector() &&
79            Ty.getNumElements() % 2 != 0 &&
80            Ty.getElementType().getSizeInBits() < 32 &&
81            Ty.getSizeInBits() % 32 != 0;
82   };
83 }
84 
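// Vectors of 16-bit elements with more than two elements, e.g. <4 x s16>.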
85 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
86   return [=](const LegalityQuery &Query) {
87     const LLT Ty = Query.Types[TypeIdx];
88     const LLT EltTy = Ty.getScalarType();
89     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
90   };
91 }
92 
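// Grow the vector by a single element, e.g. <3 x s16> -> <4 x s16>.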
93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getElementType();
97     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
98   };
99 }
100 
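// Split the vector into pieces of roughly 64 bits each,
// e.g. <4 x s32> -> <2 x s32>.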
101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     unsigned Size = Ty.getSizeInBits();
106     unsigned Pieces = (Size + 63) / 64;
107     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
108     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
109   };
110 }
111 
112 // Increase the number of vector elements so the total size reaches the next
113 // multiple of 32 bits.
114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
115   return [=](const LegalityQuery &Query) {
116     const LLT Ty = Query.Types[TypeIdx];
117 
118     const LLT EltTy = Ty.getElementType();
119     const int Size = Ty.getSizeInBits();
120     const int EltSize = EltTy.getSizeInBits();
121     const int NextMul32 = (Size + 31) / 32;
122 
123     assert(EltSize < 32);
124 
125     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
126     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
127   };
128 }
129 
130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
131   return [=](const LegalityQuery &Query) {
132     const LLT QueryTy = Query.Types[TypeIdx];
133     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
134   };
135 }
136 
137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
138   return [=](const LegalityQuery &Query) {
139     const LLT QueryTy = Query.Types[TypeIdx];
140     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
141   };
142 }
143 
144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT QueryTy = Query.Types[TypeIdx];
147     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
148   };
149 }
150 
151 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
152 // v2s16.
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     if (Ty.isVector()) {
157       const int EltSize = Ty.getElementType().getSizeInBits();
158       return EltSize == 32 || EltSize == 64 ||
159             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
160              EltSize == 128 || EltSize == 256;
161     }
162 
163     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
164   };
165 }
166 
167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getElementType() == Type;
171   };
172 }
173 
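// Vectors whose element type is either s16 or at least 32 bits wide.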
174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     if (!QueryTy.isVector())
178       return false;
179     const LLT EltTy = QueryTy.getElementType();
180     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
181   };
182 }
183 
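// Scalars wider than 32 bits that store fewer bits to memory than the type
// holds, i.e. wide truncating stores.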
184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     const LLT Ty = Query.Types[TypeIdx];
187     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
188            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
189   };
190 }
191 
192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
193   return [=](const LegalityQuery &Query) {
194     return Query.Types[TypeIdx0].getSizeInBits() <
195            Query.Types[TypeIdx1].getSizeInBits();
196   };
197 }
198 
199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
200   return [=](const LegalityQuery &Query) {
201     return Query.Types[TypeIdx0].getSizeInBits() >
202            Query.Types[TypeIdx1].getSizeInBits();
203   };
204 }
205 
206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
207                                          const GCNTargetMachine &TM)
208   : ST(ST_) {
209   using namespace TargetOpcode;
210 
211   auto GetAddrSpacePtr = [&TM](unsigned AS) {
212     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
213   };
214 
215   const LLT S1 = LLT::scalar(1);
216   const LLT S16 = LLT::scalar(16);
217   const LLT S32 = LLT::scalar(32);
218   const LLT S64 = LLT::scalar(64);
219   const LLT S128 = LLT::scalar(128);
220   const LLT S256 = LLT::scalar(256);
221   const LLT S512 = LLT::scalar(512);
222   const LLT S1024 = LLT::scalar(1024);
223 
224   const LLT V2S16 = LLT::vector(2, 16);
225   const LLT V4S16 = LLT::vector(4, 16);
226 
227   const LLT V2S32 = LLT::vector(2, 32);
228   const LLT V3S32 = LLT::vector(3, 32);
229   const LLT V4S32 = LLT::vector(4, 32);
230   const LLT V5S32 = LLT::vector(5, 32);
231   const LLT V6S32 = LLT::vector(6, 32);
232   const LLT V7S32 = LLT::vector(7, 32);
233   const LLT V8S32 = LLT::vector(8, 32);
234   const LLT V9S32 = LLT::vector(9, 32);
235   const LLT V10S32 = LLT::vector(10, 32);
236   const LLT V11S32 = LLT::vector(11, 32);
237   const LLT V12S32 = LLT::vector(12, 32);
238   const LLT V13S32 = LLT::vector(13, 32);
239   const LLT V14S32 = LLT::vector(14, 32);
240   const LLT V15S32 = LLT::vector(15, 32);
241   const LLT V16S32 = LLT::vector(16, 32);
242   const LLT V32S32 = LLT::vector(32, 32);
243 
244   const LLT V2S64 = LLT::vector(2, 64);
245   const LLT V3S64 = LLT::vector(3, 64);
246   const LLT V4S64 = LLT::vector(4, 64);
247   const LLT V5S64 = LLT::vector(5, 64);
248   const LLT V6S64 = LLT::vector(6, 64);
249   const LLT V7S64 = LLT::vector(7, 64);
250   const LLT V8S64 = LLT::vector(8, 64);
251   const LLT V16S64 = LLT::vector(16, 64);
252 
253   std::initializer_list<LLT> AllS32Vectors =
254     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
255      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
256   std::initializer_list<LLT> AllS64Vectors =
257     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
258 
259   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
260   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
261   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
262   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
263   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
264   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
265   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
266 
267   const LLT CodePtr = FlatPtr;
268 
269   const std::initializer_list<LLT> AddrSpaces64 = {
270     GlobalPtr, ConstantPtr, FlatPtr
271   };
272 
273   const std::initializer_list<LLT> AddrSpaces32 = {
274     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
275   };
276 
277   const std::initializer_list<LLT> FPTypesBase = {
278     S32, S64
279   };
280 
281   const std::initializer_list<LLT> FPTypes16 = {
282     S32, S64, S16
283   };
284 
285   const std::initializer_list<LLT> FPTypesPK16 = {
286     S32, S64, S16, V2S16
287   };
288 
289   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
290 
291   setAction({G_BRCOND, S1}, Legal); // VCC branches
292   setAction({G_BRCOND, S32}, Legal); // SCC branches
293 
294   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
295   // elements for v3s16
296   getActionDefinitionsBuilder(G_PHI)
297     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
298     .legalFor(AllS32Vectors)
299     .legalFor(AllS64Vectors)
300     .legalFor(AddrSpaces64)
301     .legalFor(AddrSpaces32)
302     .clampScalar(0, S32, S256)
303     .widenScalarToNextPow2(0, 32)
304     .clampMaxNumElements(0, S32, 16)
305     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
306     .legalIf(isPointer(0));
307 
308   if (ST.hasVOP3PInsts()) {
309     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
310       .legalFor({S32, S16, V2S16})
311       .clampScalar(0, S16, S32)
312       .clampMaxNumElements(0, S16, 2)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else if (ST.has16BitInsts()) {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32, S16})
318       .clampScalar(0, S16, S32)
319       .scalarize(0)
320       .widenScalarToNextPow2(0, 32);
321   } else {
322     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
323       .legalFor({S32})
324       .clampScalar(0, S32, S32)
325       .scalarize(0);
326   }
327 
328   // FIXME: Not really legal. Placeholder for custom lowering.
329   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
330     .customFor({S32, S64})
331     .clampScalar(0, S32, S64)
332     .widenScalarToNextPow2(0, 32)
333     .scalarize(0);
334 
335   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
336     .legalFor({S32})
337     .clampScalar(0, S32, S32)
338     .scalarize(0);
339 
340   // Report legal for any types we can handle anywhere. For the cases only legal
341   // on the SALU, RegBankSelect will be able to re-legalize.
342   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
343     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
344     .clampScalar(0, S32, S64)
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
347     .widenScalarToNextPow2(0)
348     .scalarize(0);
349 
350   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
351                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
352     .legalFor({{S32, S1}, {S32, S32}})
353     .minScalar(0, S32)
354     // TODO: .scalarize(0)
355     .lower();
356 
357   getActionDefinitionsBuilder(G_BITCAST)
358     // Don't worry about the size constraint.
359     .legalIf(all(isRegisterType(0), isRegisterType(1)))
360     .lower();
361 
362 
363   getActionDefinitionsBuilder(G_CONSTANT)
364     .legalFor({S1, S32, S64, S16, GlobalPtr,
365                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
366     .clampScalar(0, S32, S64)
367     .widenScalarToNextPow2(0)
368     .legalIf(isPointer(0));
369 
370   getActionDefinitionsBuilder(G_FCONSTANT)
371     .legalFor({S32, S64, S16})
372     .clampScalar(0, S16, S64);
373 
374   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
375       .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
376                  ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
377       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378       .clampScalarOrElt(0, S32, S1024)
379       .legalIf(isMultiple32(0))
380       .widenScalarToNextPow2(0, 32)
381       .clampMaxNumElements(0, S32, 16);
382 
383   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
384   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
385     .unsupportedFor({PrivatePtr})
386     .custom();
387   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
388 
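  // Basic FP operations are legal (or custom, for trig and fdiv) on f32 and
  // f64; f16 and packed v2f16 handling is added below for subtargets with
  // 16-bit instructions.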
389   auto &FPOpActions = getActionDefinitionsBuilder(
390     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
391     .legalFor({S32, S64});
392   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
393     .customFor({S32, S64});
394   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
395     .customFor({S32, S64});
396 
397   if (ST.has16BitInsts()) {
398     if (ST.hasVOP3PInsts())
399       FPOpActions.legalFor({S16, V2S16});
400     else
401       FPOpActions.legalFor({S16});
402 
403     TrigActions.customFor({S16});
404     FDIVActions.customFor({S16});
405   }
406 
407   auto &MinNumMaxNum = getActionDefinitionsBuilder({
408       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
409 
410   if (ST.hasVOP3PInsts()) {
411     MinNumMaxNum.customFor(FPTypesPK16)
412       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
413       .clampMaxNumElements(0, S16, 2)
414       .clampScalar(0, S16, S64)
415       .scalarize(0);
416   } else if (ST.has16BitInsts()) {
417     MinNumMaxNum.customFor(FPTypes16)
418       .clampScalar(0, S16, S64)
419       .scalarize(0);
420   } else {
421     MinNumMaxNum.customFor(FPTypesBase)
422       .clampScalar(0, S32, S64)
423       .scalarize(0);
424   }
425 
426   if (ST.hasVOP3PInsts())
427     FPOpActions.clampMaxNumElements(0, S16, 2);
428 
429   FPOpActions
430     .scalarize(0)
431     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
432 
433   TrigActions
434     .scalarize(0)
435     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
436 
437   FDIVActions
438     .scalarize(0)
439     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
440 
441   getActionDefinitionsBuilder({G_FNEG, G_FABS})
442     .legalFor(FPTypesPK16)
443     .clampMaxNumElements(0, S16, 2)
444     .scalarize(0)
445     .clampScalar(0, S16, S64);
446 
447   if (ST.has16BitInsts()) {
448     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
449       .legalFor({S32, S64, S16})
450       .scalarize(0)
451       .clampScalar(0, S16, S64);
452   } else {
453     getActionDefinitionsBuilder(G_FSQRT)
454       .legalFor({S32, S64})
455       .scalarize(0)
456       .clampScalar(0, S32, S64);
457 
458     if (ST.hasFractBug()) {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .customFor({S64})
461         .legalFor({S32, S64})
462         .scalarize(0)
463         .clampScalar(0, S32, S64);
464     } else {
465       getActionDefinitionsBuilder(G_FFLOOR)
466         .legalFor({S32, S64})
467         .scalarize(0)
468         .clampScalar(0, S32, S64);
469     }
470   }
471 
472   getActionDefinitionsBuilder(G_FPTRUNC)
473     .legalFor({{S32, S64}, {S16, S32}})
474     .scalarize(0)
475     .lower();
476 
477   getActionDefinitionsBuilder(G_FPEXT)
478     .legalFor({{S64, S32}, {S32, S16}})
479     .lowerFor({{S64, S16}}) // FIXME: Implement
480     .scalarize(0);
481 
482   getActionDefinitionsBuilder(G_FSUB)
483       // Use actual fsub instruction
484       .legalFor({S32})
485       // Must use fadd + fneg
486       .lowerFor({S64, S16, V2S16})
487       .scalarize(0)
488       .clampScalar(0, S32, S64);
489 
490   // Whether this is legal depends on the floating point mode for the function.
491   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
492   if (ST.hasMadF16())
493     FMad.customFor({S32, S16});
494   else
495     FMad.customFor({S32});
496   FMad.scalarize(0)
497       .lower();
498 
499   // TODO: Do we need to clamp maximum bitwidth?
500   getActionDefinitionsBuilder(G_TRUNC)
501     .legalIf(isScalar(0))
502     .legalFor({{V2S16, V2S32}})
503     .clampMaxNumElements(0, S16, 2)
504     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
505     // situations (like an invalid implicit use), we don't want the legalizer to
506     // loop infinitely.
507     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
508     .alwaysLegal();
509 
510   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
511     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
512                {S32, S1}, {S64, S1}, {S16, S1}})
513     .scalarize(0)
514     .clampScalar(0, S32, S64)
515     .widenScalarToNextPow2(1, 32);
516 
517   // TODO: Split s1->s64 during regbankselect for VALU.
518   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
519     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
520     .lowerFor({{S32, S64}})
521     .lowerIf(typeIs(1, S1))
522     .customFor({{S64, S64}});
523   if (ST.has16BitInsts())
524     IToFP.legalFor({{S16, S16}});
525   IToFP.clampScalar(1, S32, S64)
526        .scalarize(0)
527        .widenScalarToNextPow2(1);
528 
529   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
530     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
531     .customFor({{S64, S64}});
532   if (ST.has16BitInsts())
533     FPToI.legalFor({{S16, S16}});
534   else
535     FPToI.minScalar(1, S32);
536 
537   FPToI.minScalar(0, S32)
538        .scalarize(0)
539        .lower();
540 
541   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
542     .scalarize(0)
543     .lower();
544 
545   if (ST.has16BitInsts()) {
546     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
547       .legalFor({S16, S32, S64})
548       .clampScalar(0, S16, S64)
549       .scalarize(0);
550   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
551     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
552       .legalFor({S32, S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   } else {
556     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
557       .legalFor({S32})
558       .customFor({S64})
559       .clampScalar(0, S32, S64)
560       .scalarize(0);
561   }
562 
563   getActionDefinitionsBuilder(G_PTR_ADD)
564     .scalarize(0)
565     .alwaysLegal();
566 
567   // TODO: Clamp mask to pointer sizes
568   getActionDefinitionsBuilder(G_PTRMASK)
569     .scalarize(0)
570     .alwaysLegal();
571 
572   auto &CmpBuilder =
573     getActionDefinitionsBuilder(G_ICMP)
574     // The compare output type differs based on the register bank of the output,
575     // so make both s1 and s32 legal.
576     //
577     // Scalar compares producing output in scc will be promoted to s32, as that
578     // is the allocatable register type that will be needed for the copy from
579     // scc. This will be promoted during RegBankSelect, and we assume something
580     // before that won't try to use s32 result types.
581     //
582     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
583     // bank.
584     .legalForCartesianProduct(
585       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
586     .legalForCartesianProduct(
587       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
588   if (ST.has16BitInsts()) {
589     CmpBuilder.legalFor({{S1, S16}});
590   }
591 
592   CmpBuilder
593     .widenScalarToNextPow2(1)
594     .clampScalar(1, S32, S64)
595     .scalarize(0)
596     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
597 
598   getActionDefinitionsBuilder(G_FCMP)
599     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
600     .widenScalarToNextPow2(1)
601     .clampScalar(1, S32, S64)
602     .scalarize(0);
603 
604   // FIXME: fpow has a selection pattern that should move to custom lowering.
605   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
606   if (ST.has16BitInsts())
607     Exp2Ops.legalFor({S32, S16});
608   else
609     Exp2Ops.legalFor({S32});
610   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
611   Exp2Ops.scalarize(0);
612 
613   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
614   if (ST.has16BitInsts())
615     ExpOps.customFor({{S32}, {S16}});
616   else
617     ExpOps.customFor({S32});
618   ExpOps.clampScalar(0, MinScalarFPTy, S32)
619         .scalarize(0);
620 
621   // The 64-bit versions produce 32-bit results, but only on the SALU.
622   getActionDefinitionsBuilder(G_CTPOP)
623     .legalFor({{S32, S32}, {S32, S64}})
624     .clampScalar(0, S32, S32)
625     .clampScalar(1, S32, S64)
626     .scalarize(0)
627     .widenScalarToNextPow2(0, 32)
628     .widenScalarToNextPow2(1, 32);
629 
630   // The hardware instructions return a different result on 0 than the generic
631   // instructions expect. The hardware produces -1, but these produce the
632   // bitwidth.
633   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
634     .scalarize(0)
635     .clampScalar(0, S32, S32)
636     .clampScalar(1, S32, S64)
637     .widenScalarToNextPow2(0, 32)
638     .widenScalarToNextPow2(1, 32)
639     .lower();
640 
641   // The 64-bit versions produce 32-bit results, but only on the SALU.
642   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
643     .legalFor({{S32, S32}, {S32, S64}})
644     .clampScalar(0, S32, S32)
645     .clampScalar(1, S32, S64)
646     .scalarize(0)
647     .widenScalarToNextPow2(0, 32)
648     .widenScalarToNextPow2(1, 32);
649 
650   getActionDefinitionsBuilder(G_BITREVERSE)
651     .legalFor({S32})
652     .clampScalar(0, S32, S32)
653     .scalarize(0);
654 
655   if (ST.has16BitInsts()) {
656     getActionDefinitionsBuilder(G_BSWAP)
657       .legalFor({S16, S32, V2S16})
658       .clampMaxNumElements(0, S16, 2)
659       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
660       // narrowScalar limitation.
661       .widenScalarToNextPow2(0)
662       .clampScalar(0, S16, S32)
663       .scalarize(0);
664 
665     if (ST.hasVOP3PInsts()) {
666       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
667         .legalFor({S32, S16, V2S16})
668         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
669         .clampMaxNumElements(0, S16, 2)
670         .minScalar(0, S16)
671         .widenScalarToNextPow2(0)
672         .scalarize(0)
673         .lower();
674     } else {
675       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
676         .legalFor({S32, S16})
677         .widenScalarToNextPow2(0)
678         .minScalar(0, S16)
679         .scalarize(0)
680         .lower();
681     }
682   } else {
683     // TODO: Should have same legality without v_perm_b32
684     getActionDefinitionsBuilder(G_BSWAP)
685       .legalFor({S32})
686       .lowerIf(narrowerThan(0, 32))
687       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
688       // narrowScalar limitation.
689       .widenScalarToNextPow2(0)
690       .maxScalar(0, S32)
691       .scalarize(0)
692       .lower();
693 
694     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
695       .legalFor({S32})
696       .minScalar(0, S32)
697       .widenScalarToNextPow2(0)
698       .scalarize(0)
699       .lower();
700   }
701 
702   getActionDefinitionsBuilder(G_INTTOPTR)
703     // List the common cases
704     .legalForCartesianProduct(AddrSpaces64, {S64})
705     .legalForCartesianProduct(AddrSpaces32, {S32})
706     .scalarize(0)
707     // Accept any address space as long as the size matches
708     .legalIf(sameSize(0, 1))
709     .widenScalarIf(smallerThan(1, 0),
710       [](const LegalityQuery &Query) {
711         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
712       })
713     .narrowScalarIf(greaterThan(1, 0),
714       [](const LegalityQuery &Query) {
715         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
716       });
717 
718   getActionDefinitionsBuilder(G_PTRTOINT)
719     // List the common cases
720     .legalForCartesianProduct(AddrSpaces64, {S64})
721     .legalForCartesianProduct(AddrSpaces32, {S32})
722     .scalarize(0)
723     // Accept any address space as long as the size matches
724     .legalIf(sameSize(0, 1))
725     .widenScalarIf(smallerThan(0, 1),
726       [](const LegalityQuery &Query) {
727         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
728       })
729     .narrowScalarIf(
730       greaterThan(0, 1),
731       [](const LegalityQuery &Query) {
732         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
733       });
734 
735   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
736     .scalarize(0)
737     .custom();
738 
739   // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
740   // handle some operations by just promoting the register during
741   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
742   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
743     switch (AS) {
744     // FIXME: Private element size.
745     case AMDGPUAS::PRIVATE_ADDRESS:
746       return 32;
747     // FIXME: Check subtarget
748     case AMDGPUAS::LOCAL_ADDRESS:
749       return ST.useDS128() ? 128 : 64;
750 
751     // Treat constant and global as identical. SMRD loads are sometimes usable
752     // for global loads (ideally constant address space should be eliminated)
753     // depending on the context. Legality cannot be context dependent, but
754     // RegBankSelect can split the load as necessary depending on the pointer
755     // register bank/uniformity and if the memory is invariant or not written in
756     // a kernel.
757     case AMDGPUAS::CONSTANT_ADDRESS:
758     case AMDGPUAS::GLOBAL_ADDRESS:
759       return IsLoad ? 512 : 128;
760     default:
761       return 128;
762     }
763   };
764 
765   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
766                                     bool IsLoad) -> bool {
767     const LLT DstTy = Query.Types[0];
768 
769     // Split vector extloads.
770     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
771     unsigned Align = Query.MMODescrs[0].AlignInBits;
772 
773     if (MemSize < DstTy.getSizeInBits())
774       MemSize = std::max(MemSize, Align);
775 
776     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
777       return true;
778 
779     const LLT PtrTy = Query.Types[1];
780     unsigned AS = PtrTy.getAddressSpace();
781     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
782       return true;
783 
784     // Catch weird sized loads that don't evenly divide into the access sizes
785     // TODO: May be able to widen depending on alignment etc.
786     unsigned NumRegs = (MemSize + 31) / 32;
787     if (NumRegs == 3) {
788       if (!ST.hasDwordx3LoadStores())
789         return true;
790     } else {
791       // If the alignment allows, these should have been widened.
792       if (!isPowerOf2_32(NumRegs))
793         return true;
794     }
795 
796     if (Align < MemSize) {
797       const SITargetLowering *TLI = ST.getTargetLowering();
798       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
799     }
800 
801     return false;
802   };
803 
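  // Widen non-power-of-2 load results to the next power-of-2 size when the
  // alignment covers the rounded-up size and the original size is below the
  // maximum access size for the address space.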
804   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
805     unsigned Size = Query.Types[0].getSizeInBits();
806     if (isPowerOf2_32(Size))
807       return false;
808 
809     if (Size == 96 && ST.hasDwordx3LoadStores())
810       return false;
811 
812     unsigned AddrSpace = Query.Types[1].getAddressSpace();
813     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
814       return false;
815 
816     unsigned Align = Query.MMODescrs[0].AlignInBits;
817     unsigned RoundedSize = NextPowerOf2(Size);
818     return (Align >= RoundedSize);
819   };
820 
821   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
822   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
823   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
824 
825   // TODO: Refine based on subtargets which support unaligned access or 128-bit
826   // LDS
827   // TODO: Unsupported flat for SI.
828 
829   for (unsigned Op : {G_LOAD, G_STORE}) {
830     const bool IsStore = Op == G_STORE;
831 
832     auto &Actions = getActionDefinitionsBuilder(Op);
833     // Whitelist the common cases.
834     // TODO: Loads to s16 on gfx9
835     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
836                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
837                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
838                                       {S128, GlobalPtr, 128, GlobalAlign32},
839                                       {S64, GlobalPtr, 64, GlobalAlign32},
840                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
841                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
842                                       {S32, GlobalPtr, 8, GlobalAlign8},
843                                       {S32, GlobalPtr, 16, GlobalAlign16},
844 
845                                       {S32, LocalPtr, 32, 32},
846                                       {S64, LocalPtr, 64, 32},
847                                       {V2S32, LocalPtr, 64, 32},
848                                       {S32, LocalPtr, 8, 8},
849                                       {S32, LocalPtr, 16, 16},
850                                       {V2S16, LocalPtr, 32, 32},
851 
852                                       {S32, PrivatePtr, 32, 32},
853                                       {S32, PrivatePtr, 8, 8},
854                                       {S32, PrivatePtr, 16, 16},
855                                       {V2S16, PrivatePtr, 32, 32},
856 
857                                       {S32, FlatPtr, 32, GlobalAlign32},
858                                       {S32, FlatPtr, 16, GlobalAlign16},
859                                       {S32, FlatPtr, 8, GlobalAlign8},
860                                       {V2S16, FlatPtr, 32, GlobalAlign32},
861 
862                                       {S32, ConstantPtr, 32, GlobalAlign32},
863                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
864                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
865                                       {S64, ConstantPtr, 64, GlobalAlign32},
866                                       {S128, ConstantPtr, 128, GlobalAlign32},
867                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
868     Actions
869         .customIf(typeIs(1, Constant32Ptr))
870         // Widen suitably aligned loads by loading extra elements.
871         .moreElementsIf([=](const LegalityQuery &Query) {
872             const LLT Ty = Query.Types[0];
873             return Op == G_LOAD && Ty.isVector() &&
874                    shouldWidenLoadResult(Query);
875           }, moreElementsToNextPow2(0))
876         .widenScalarIf([=](const LegalityQuery &Query) {
877             const LLT Ty = Query.Types[0];
878             return Op == G_LOAD && !Ty.isVector() &&
879                    shouldWidenLoadResult(Query);
880           }, widenScalarOrEltToNextPow2(0))
881         .narrowScalarIf(
882             [=](const LegalityQuery &Query) -> bool {
883               return !Query.Types[0].isVector() &&
884                      needToSplitMemOp(Query, Op == G_LOAD);
885             },
886             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
887               const LLT DstTy = Query.Types[0];
888               const LLT PtrTy = Query.Types[1];
889 
890               const unsigned DstSize = DstTy.getSizeInBits();
891               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
892 
893               // Split extloads.
894               if (DstSize > MemSize)
895                 return std::make_pair(0, LLT::scalar(MemSize));
896 
897               if (!isPowerOf2_32(DstSize)) {
898                 // We're probably decomposing an odd sized store. Try to split
899                 // to the widest type. TODO: Account for alignment. As-is it
900                 // should be OK, since the new parts will be further legalized.
901                 unsigned FloorSize = PowerOf2Floor(DstSize);
902                 return std::make_pair(0, LLT::scalar(FloorSize));
903               }
904 
905               if (DstSize > 32 && (DstSize % 32 != 0)) {
906                 // FIXME: Need a way to specify non-extload of larger size if
907                 // suitably aligned.
908                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
909               }
910 
911               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
912                                                      Op == G_LOAD);
913               if (MemSize > MaxSize)
914                 return std::make_pair(0, LLT::scalar(MaxSize));
915 
916               unsigned Align = Query.MMODescrs[0].AlignInBits;
917               return std::make_pair(0, LLT::scalar(Align));
918             })
919         .fewerElementsIf(
920             [=](const LegalityQuery &Query) -> bool {
921               return Query.Types[0].isVector() &&
922                      needToSplitMemOp(Query, Op == G_LOAD);
923             },
924             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
925               const LLT DstTy = Query.Types[0];
926               const LLT PtrTy = Query.Types[1];
927 
928               LLT EltTy = DstTy.getElementType();
929               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
930                                                      Op == G_LOAD);
931 
932               // FIXME: Handle widened to power of 2 results better. This ends
933               // up scalarizing.
934               // FIXME: 3 element stores scalarized on SI
935 
936               // Split if it's too large for the address space.
937               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
938                 unsigned NumElts = DstTy.getNumElements();
939                 unsigned EltSize = EltTy.getSizeInBits();
940 
941                 if (MaxSize % EltSize == 0) {
942                   return std::make_pair(
943                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
944                 }
945 
946                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
947 
948                 // FIXME: Refine when odd breakdowns handled
949                 // The scalars will need to be re-legalized.
950                 if (NumPieces == 1 || NumPieces >= NumElts ||
951                     NumElts % NumPieces != 0)
952                   return std::make_pair(0, EltTy);
953 
954                 return std::make_pair(0,
955                                       LLT::vector(NumElts / NumPieces, EltTy));
956               }
957 
958               // FIXME: We could probably handle weird extending loads better.
959               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
960               if (DstTy.getSizeInBits() > MemSize)
961                 return std::make_pair(0, EltTy);
962 
963               unsigned EltSize = EltTy.getSizeInBits();
964               unsigned DstSize = DstTy.getSizeInBits();
965               if (!isPowerOf2_32(DstSize)) {
966                 // We're probably decomposing an odd sized store. Try to split
967                 // to the widest type. TODO: Account for alignment. As-is it
968                 // should be OK, since the new parts will be further legalized.
969                 unsigned FloorSize = PowerOf2Floor(DstSize);
970                 return std::make_pair(
971                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
972               }
973 
974               // Need to split because of alignment.
975               unsigned Align = Query.MMODescrs[0].AlignInBits;
976               if (EltSize > Align &&
977                   (EltSize / Align < DstTy.getNumElements())) {
978                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
979               }
980 
981               // May need relegalization for the scalars.
982               return std::make_pair(0, EltTy);
983             })
984         .minScalar(0, S32);
985 
986     if (IsStore)
987       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
988 
989     // TODO: Need a bitcast lower option?
990     Actions
991         .legalIf([=](const LegalityQuery &Query) {
992           const LLT Ty0 = Query.Types[0];
993           unsigned Size = Ty0.getSizeInBits();
994           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
995           unsigned Align = Query.MMODescrs[0].AlignInBits;
996 
997           // FIXME: Widening a store based on its alignment is not valid.
998           if (MemSize < Size)
999             MemSize = std::max(MemSize, Align);
1000 
1001           // No extending vector loads.
1002           if (Size > MemSize && Ty0.isVector())
1003             return false;
1004 
1005           switch (MemSize) {
1006           case 8:
1007           case 16:
1008             return Size == 32;
1009           case 32:
1010           case 64:
1011           case 128:
1012             return true;
1013           case 96:
1014             return ST.hasDwordx3LoadStores();
1015           case 256:
1016           case 512:
1017             return true;
1018           default:
1019             return false;
1020           }
1021         })
1022         .widenScalarToNextPow2(0)
1023         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1024   }
1025 
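  // Sign- and zero-extending loads from 8-bit and 16-bit memory sizes into a
  // 32-bit result.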
1026   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1027                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1028                                                   {S32, GlobalPtr, 16, 2 * 8},
1029                                                   {S32, LocalPtr, 8, 8},
1030                                                   {S32, LocalPtr, 16, 16},
1031                                                   {S32, PrivatePtr, 8, 8},
1032                                                   {S32, PrivatePtr, 16, 16},
1033                                                   {S32, ConstantPtr, 8, 8},
1034                                                   {S32, ConstantPtr, 16, 2 * 8}});
1035   if (ST.hasFlatAddressSpace()) {
1036     ExtLoads.legalForTypesWithMemDesc(
1037         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1038   }
1039 
1040   ExtLoads.clampScalar(0, S32, S32)
1041           .widenScalarToNextPow2(0)
1042           .unsupportedIfMemSizeNotPow2()
1043           .lower();
1044 
1045   auto &Atomics = getActionDefinitionsBuilder(
1046     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1047      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1048      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1049      G_ATOMICRMW_UMIN})
1050     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1051                {S64, GlobalPtr}, {S64, LocalPtr}});
1052   if (ST.hasFlatAddressSpace()) {
1053     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1054   }
1055 
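  // The only FP atomic add combination marked legal here is a 32-bit value on
  // an LDS pointer.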
1056   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1057     .legalFor({{S32, LocalPtr}});
1058 
1059   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1060   // demarshalling
1061   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1062     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1063                 {S32, FlatPtr}, {S64, FlatPtr}})
1064     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1065                {S32, RegionPtr}, {S64, RegionPtr}});
1066   // TODO: Pointer types, any 32-bit or 64-bit vector
1067 
1068   // Condition should be s32 for scalar, s1 for vector.
1069   getActionDefinitionsBuilder(G_SELECT)
1070     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1071           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1072           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1073     .clampScalar(0, S16, S64)
1074     .scalarize(1)
1075     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1076     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1077     .clampMaxNumElements(0, S32, 2)
1078     .clampMaxNumElements(0, LocalPtr, 2)
1079     .clampMaxNumElements(0, PrivatePtr, 2)
1080     .scalarize(0)
1081     .widenScalarToNextPow2(0)
1082     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1083 
1084   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1085   // be more flexible with the shift amount type.
1086   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1087     .legalFor({{S32, S32}, {S64, S32}});
1088   if (ST.has16BitInsts()) {
1089     if (ST.hasVOP3PInsts()) {
1090       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1091             .clampMaxNumElements(0, S16, 2);
1092     } else
1093       Shifts.legalFor({{S16, S16}});
1094 
1095     // TODO: Support 16-bit shift amounts for all types
1096     Shifts.widenScalarIf(
1097       [=](const LegalityQuery &Query) {
1098         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1099         // 32-bit amount.
1100         const LLT ValTy = Query.Types[0];
1101         const LLT AmountTy = Query.Types[1];
1102         return ValTy.getSizeInBits() <= 16 &&
1103                AmountTy.getSizeInBits() < 16;
1104       }, changeTo(1, S16));
1105     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1106     Shifts.clampScalar(1, S32, S32);
1107     Shifts.clampScalar(0, S16, S64);
1108     Shifts.widenScalarToNextPow2(0, 16);
1109   } else {
1110     // Make sure we legalize the shift amount type first, as the general
1111     // expansion for the shifted type will produce much worse code if it hasn't
1112     // been truncated already.
1113     Shifts.clampScalar(1, S32, S32);
1114     Shifts.clampScalar(0, S32, S64);
1115     Shifts.widenScalarToNextPow2(0, 32);
1116   }
1117   Shifts.scalarize(0);
1118 
1119   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1120     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1121     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1122     unsigned IdxTypeIdx = 2;
1123 
1124     getActionDefinitionsBuilder(Op)
1125       .customIf([=](const LegalityQuery &Query) {
1126           const LLT EltTy = Query.Types[EltTypeIdx];
1127           const LLT VecTy = Query.Types[VecTypeIdx];
1128           const LLT IdxTy = Query.Types[IdxTypeIdx];
1129           return (EltTy.getSizeInBits() == 16 ||
1130                   EltTy.getSizeInBits() % 32 == 0) &&
1131                  VecTy.getSizeInBits() % 32 == 0 &&
1132                  VecTy.getSizeInBits() <= 1024 &&
1133                  IdxTy.getSizeInBits() == 32;
1134         })
1135       .clampScalar(EltTypeIdx, S32, S64)
1136       .clampScalar(VecTypeIdx, S32, S64)
1137       .clampScalar(IdxTypeIdx, S32, S32);
1138   }
1139 
1140   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1141     .unsupportedIf([=](const LegalityQuery &Query) {
1142         const LLT &EltTy = Query.Types[1].getElementType();
1143         return Query.Types[0] != EltTy;
1144       });
1145 
1146   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1147     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1148     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1149 
1150     // FIXME: Doesn't handle extract of illegal sizes.
1151     getActionDefinitionsBuilder(Op)
1152       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1153       // FIXME: Multiples of 16 should not be legal.
1154       .legalIf([=](const LegalityQuery &Query) {
1155           const LLT BigTy = Query.Types[BigTyIdx];
1156           const LLT LitTy = Query.Types[LitTyIdx];
1157           return (BigTy.getSizeInBits() % 32 == 0) &&
1158                  (LitTy.getSizeInBits() % 16 == 0);
1159         })
1160       .widenScalarIf(
1161         [=](const LegalityQuery &Query) {
1162           const LLT BigTy = Query.Types[BigTyIdx];
1163           return (BigTy.getScalarSizeInBits() < 16);
1164         },
1165         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1166       .widenScalarIf(
1167         [=](const LegalityQuery &Query) {
1168           const LLT LitTy = Query.Types[LitTyIdx];
1169           return (LitTy.getScalarSizeInBits() < 16);
1170         },
1171         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1172       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1173       .widenScalarToNextPow2(BigTyIdx, 32);
1174 
1175   }
1176 
1177   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1178     .legalForCartesianProduct(AllS32Vectors, {S32})
1179     .legalForCartesianProduct(AllS64Vectors, {S64})
1180     .clampNumElements(0, V16S32, V32S32)
1181     .clampNumElements(0, V2S64, V16S64)
1182     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1183 
1184   if (ST.hasScalarPackInsts()) {
1185     BuildVector
1186       // FIXME: Should probably widen s1 vectors straight to s32
1187       .minScalarOrElt(0, S16)
1188       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1189       .minScalar(1, S32);
1190 
1191     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1192       .legalFor({V2S16, S32})
1193       .lower();
1194     BuildVector.minScalarOrElt(0, S32);
1195   } else {
1196     BuildVector.customFor({V2S16, S16});
1197     BuildVector.minScalarOrElt(0, S32);
1198 
1199     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1200       .customFor({V2S16, S32})
1201       .lower();
1202   }
1203 
1204   BuildVector.legalIf(isRegisterType(0));
1205 
1206   // FIXME: Clamp maximum size
1207   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1208     .legalIf(isRegisterType(0));
1209 
1210   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1211   // pre-legalize.
1212   if (ST.hasVOP3PInsts()) {
1213     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1214       .customFor({V2S16, V2S16})
1215       .lower();
1216   } else
1217     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1218 
1219   // Merge/Unmerge
1220   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1221     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1222     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1223 
1224     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1225       const LLT Ty = Query.Types[TypeIdx];
1226       if (Ty.isVector()) {
1227         const LLT &EltTy = Ty.getElementType();
1228         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1229           return true;
1230         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1231           return true;
1232       }
1233       return false;
1234     };
1235 
1236     auto &Builder = getActionDefinitionsBuilder(Op)
1237       .lowerFor({{S16, V2S16}})
1238       .lowerIf([=](const LegalityQuery &Query) {
1239           const LLT BigTy = Query.Types[BigTyIdx];
1240           return BigTy.getSizeInBits() == 32;
1241         })
1242       // Try to widen to s16 first for small types.
1243       // TODO: Only do this on targets with legal s16 shifts
1244       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1245       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1246       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1247       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1248                            elementTypeIs(1, S16)),
1249                        changeTo(1, V2S16))
1250       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1251       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1252       // valid.
1253       .clampScalar(LitTyIdx, S32, S512)
1254       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1255       // Break up vectors with weird elements into scalars
1256       .fewerElementsIf(
1257         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1258         scalarize(0))
1259       .fewerElementsIf(
1260         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1261         scalarize(1))
1262       .clampScalar(BigTyIdx, S32, S1024);
1263 
1264     if (Op == G_MERGE_VALUES) {
1265       Builder.widenScalarIf(
1266         // TODO: Use 16-bit shifts if legal for 8-bit values?
1267         [=](const LegalityQuery &Query) {
1268           const LLT Ty = Query.Types[LitTyIdx];
1269           return Ty.getSizeInBits() < 32;
1270         },
1271         changeTo(LitTyIdx, S32));
1272     }
1273 
1274     Builder.widenScalarIf(
1275       [=](const LegalityQuery &Query) {
1276         const LLT Ty = Query.Types[BigTyIdx];
1277         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1278           Ty.getSizeInBits() % 16 != 0;
1279       },
1280       [=](const LegalityQuery &Query) {
1281         // Pick the next power of 2, or a multiple of 64 over 128, whichever is
1282         // smaller.
1283         const LLT &Ty = Query.Types[BigTyIdx];
1284         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1285         if (NewSizeInBits >= 256) {
1286           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1287           if (RoundedTo < NewSizeInBits)
1288             NewSizeInBits = RoundedTo;
1289         }
1290         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1291       })
1292       .legalIf([=](const LegalityQuery &Query) {
1293           const LLT &BigTy = Query.Types[BigTyIdx];
1294           const LLT &LitTy = Query.Types[LitTyIdx];
1295 
1296           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1297             return false;
1298           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1299             return false;
1300 
1301           return BigTy.getSizeInBits() % 16 == 0 &&
1302                  LitTy.getSizeInBits() % 16 == 0 &&
1303                  BigTy.getSizeInBits() <= 1024;
1304         })
1305       // Any vectors left are the wrong size. Scalarize them.
1306       .scalarize(0)
1307       .scalarize(1);
1308   }
1309 
1310   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1311   // RegBankSelect.
1312   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1313     .legalFor({{S32}, {S64}});
1314 
1315   if (ST.hasVOP3PInsts()) {
1316     SextInReg.lowerFor({{V2S16}})
1317       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1318       // get more vector shift opportunities, since we'll get those when
1319       // expanded.
1320       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1321   } else if (ST.has16BitInsts()) {
1322     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1323   } else {
1324     // Prefer to promote to s32 before lowering if we don't have 16-bit
1325     // shifts. This avoids a lot of intermediate truncate and extend operations.
1326     SextInReg.lowerFor({{S32}, {S64}});
1327   }
1328 
1329   SextInReg
1330     .scalarize(0)
1331     .clampScalar(0, S32, S64)
1332     .lower();
1333 
1334   getActionDefinitionsBuilder(G_FSHR)
1335     .legalFor({{S32, S32}})
1336     .scalarize(0)
1337     .lower();
1338 
1339   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1340     .legalFor({S64});
1341 
1342   getActionDefinitionsBuilder({
1343       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1344       G_FCOPYSIGN,
1345 
1346       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1347       G_READ_REGISTER,
1348       G_WRITE_REGISTER,
1349 
1350       G_SADDO, G_SSUBO,
1351 
1352        // TODO: Implement
1353       G_FMINIMUM, G_FMAXIMUM,
1354       G_FSHL
1355     }).lower();
1356 
1357   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1358         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1359         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1360     .unsupported();
1361 
1362   computeTables();
1363   verify(*ST.getInstrInfo());
1364 }
1365 
1366 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1367                                          MachineRegisterInfo &MRI,
1368                                          MachineIRBuilder &B,
1369                                          GISelChangeObserver &Observer) const {
1370   switch (MI.getOpcode()) {
1371   case TargetOpcode::G_ADDRSPACE_CAST:
1372     return legalizeAddrSpaceCast(MI, MRI, B);
1373   case TargetOpcode::G_FRINT:
1374     return legalizeFrint(MI, MRI, B);
1375   case TargetOpcode::G_FCEIL:
1376     return legalizeFceil(MI, MRI, B);
1377   case TargetOpcode::G_INTRINSIC_TRUNC:
1378     return legalizeIntrinsicTrunc(MI, MRI, B);
1379   case TargetOpcode::G_SITOFP:
1380     return legalizeITOFP(MI, MRI, B, true);
1381   case TargetOpcode::G_UITOFP:
1382     return legalizeITOFP(MI, MRI, B, false);
1383   case TargetOpcode::G_FPTOSI:
1384     return legalizeFPTOI(MI, MRI, B, true);
1385   case TargetOpcode::G_FPTOUI:
1386     return legalizeFPTOI(MI, MRI, B, false);
1387   case TargetOpcode::G_FMINNUM:
1388   case TargetOpcode::G_FMAXNUM:
1389   case TargetOpcode::G_FMINNUM_IEEE:
1390   case TargetOpcode::G_FMAXNUM_IEEE:
1391     return legalizeMinNumMaxNum(MI, MRI, B);
1392   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1393     return legalizeExtractVectorElt(MI, MRI, B);
1394   case TargetOpcode::G_INSERT_VECTOR_ELT:
1395     return legalizeInsertVectorElt(MI, MRI, B);
1396   case TargetOpcode::G_SHUFFLE_VECTOR:
1397     return legalizeShuffleVector(MI, MRI, B);
1398   case TargetOpcode::G_FSIN:
1399   case TargetOpcode::G_FCOS:
1400     return legalizeSinCos(MI, MRI, B);
1401   case TargetOpcode::G_GLOBAL_VALUE:
1402     return legalizeGlobalValue(MI, MRI, B);
1403   case TargetOpcode::G_LOAD:
1404     return legalizeLoad(MI, MRI, B, Observer);
1405   case TargetOpcode::G_FMAD:
1406     return legalizeFMad(MI, MRI, B);
1407   case TargetOpcode::G_FDIV:
1408     return legalizeFDIV(MI, MRI, B);
1409   case TargetOpcode::G_UDIV:
1410   case TargetOpcode::G_UREM:
1411     return legalizeUDIV_UREM(MI, MRI, B);
1412   case TargetOpcode::G_SDIV:
1413   case TargetOpcode::G_SREM:
1414     return legalizeSDIV_SREM(MI, MRI, B);
1415   case TargetOpcode::G_ATOMIC_CMPXCHG:
1416     return legalizeAtomicCmpXChg(MI, MRI, B);
1417   case TargetOpcode::G_FLOG:
1418     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1419   case TargetOpcode::G_FLOG10:
1420     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1421   case TargetOpcode::G_FEXP:
1422     return legalizeFExp(MI, B);
1423   case TargetOpcode::G_FPOW:
1424     return legalizeFPow(MI, B);
1425   case TargetOpcode::G_FFLOOR:
1426     return legalizeFFloor(MI, MRI, B);
1427   case TargetOpcode::G_BUILD_VECTOR:
1428     return legalizeBuildVector(MI, MRI, B);
1429   default:
1430     return false;
1431   }
1432 
1433   llvm_unreachable("expected switch to return");
1434 }
1435 
1436 Register AMDGPULegalizerInfo::getSegmentAperture(
1437   unsigned AS,
1438   MachineRegisterInfo &MRI,
1439   MachineIRBuilder &B) const {
1440   MachineFunction &MF = B.getMF();
1441   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1442   const LLT S32 = LLT::scalar(32);
1443 
1444   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1445 
1446   if (ST.hasApertureRegs()) {
1447     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1448     // getreg.
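    // Roughly, the sequence built below is (illustrative only; the actual
    // field comes from the Offset/WidthM1 constants below):
    //   %getreg:sreg_32 = S_GETREG_B32 <MEM_BASES field for this segment>
    //   %aperture_hi    = G_SHL %getreg, (WidthM1 + 1)
    // i.e. the extracted 16-bit field is shifted into the upper half of the
    // 32-bit value that callers use as the high dword of the aperture.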
1449     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1450         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1451         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1452     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1453         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1454         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1455     unsigned Encoding =
1456         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1457         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1458         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1459 
1460     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1461 
1462     B.buildInstr(AMDGPU::S_GETREG_B32)
1463       .addDef(GetReg)
1464       .addImm(Encoding);
1465     MRI.setType(GetReg, S32);
1466 
1467     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1468     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1469   }
1470 
1471   Register QueuePtr = MRI.createGenericVirtualRegister(
1472     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1473 
1474   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1475   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1476     return Register();
1477 
1478   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1479   // private_segment_aperture_base_hi.
1480   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1481 
1482   // TODO: can we be smarter about machine pointer info?
1483   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1484   MachineMemOperand *MMO = MF.getMachineMemOperand(
1485       PtrInfo,
1486       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1487           MachineMemOperand::MOInvariant,
1488       4, commonAlignment(Align(64), StructOffset));
1489 
1490   Register LoadAddr;
1491 
1492   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1493   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1494 }
1495 
1496 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1497   MachineInstr &MI, MachineRegisterInfo &MRI,
1498   MachineIRBuilder &B) const {
1499   MachineFunction &MF = B.getMF();
1500 
1501   B.setInstr(MI);
1502 
1503   const LLT S32 = LLT::scalar(32);
1504   Register Dst = MI.getOperand(0).getReg();
1505   Register Src = MI.getOperand(1).getReg();
1506 
1507   LLT DstTy = MRI.getType(Dst);
1508   LLT SrcTy = MRI.getType(Src);
1509   unsigned DestAS = DstTy.getAddressSpace();
1510   unsigned SrcAS = SrcTy.getAddressSpace();
1511 
1512   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1513   // vector element.
1514   assert(!DstTy.isVector());
1515 
1516   const AMDGPUTargetMachine &TM
1517     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1518 
1519   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1520   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1521     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1522     return true;
1523   }
1524 
1525   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1526     // Truncate.
1527     B.buildExtract(Dst, Src, 0);
1528     MI.eraseFromParent();
1529     return true;
1530   }
1531 
1532   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1533     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1534     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1535 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1539     auto HighAddr = B.buildConstant(
1540       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1541     B.buildMerge(Dst, {Src, HighAddr});
1542     MI.eraseFromParent();
1543     return true;
1544   }
1545 
1546   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1547     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1548            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1549     unsigned NullVal = TM.getNullPointerValue(DestAS);
1550 
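    // A minimal sketch of the expansion built below:
    //   %lo32 = G_EXTRACT %src, 0
    //   %cmp  = G_ICMP ne, %src, <flat null>
    //   %dst  = G_SELECT %cmp, %lo32, <segment null>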
1551     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1552     auto FlatNull = B.buildConstant(SrcTy, 0);
1553 
1554     // Extract low 32-bits of the pointer.
1555     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1556 
1557     auto CmpRes =
1558         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1559     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1560 
1561     MI.eraseFromParent();
1562     return true;
1563   }
1564 
1565   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1566     return false;
1567 
1568   if (!ST.hasFlatAddressSpace())
1569     return false;
1570 
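  // Casting a group/private pointer to flat is expanded, roughly, as:
  //   %aperture = getSegmentAperture(SrcAS)
  //   %cmp      = G_ICMP ne, %src, <segment null>
  //   %ptr      = G_MERGE_VALUES (G_PTRTOINT %src), %aperture
  //   %dst      = G_SELECT %cmp, %ptr, <flat null>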
1571   auto SegmentNull =
1572       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1573   auto FlatNull =
1574       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1575 
1576   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1577   if (!ApertureReg.isValid())
1578     return false;
1579 
1580   auto CmpRes =
1581       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1582 
1583   // Coerce the type of the low half of the result so we can use merge_values.
1584   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1585 
1586   // TODO: Should we allow mismatched types but matching sizes in merges to
1587   // avoid the ptrtoint?
1588   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1589   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1590 
1591   MI.eraseFromParent();
1592   return true;
1593 }
1594 
1595 bool AMDGPULegalizerInfo::legalizeFrint(
1596   MachineInstr &MI, MachineRegisterInfo &MRI,
1597   MachineIRBuilder &B) const {
1598   B.setInstr(MI);
1599 
1600   Register Src = MI.getOperand(1).getReg();
1601   LLT Ty = MRI.getType(Src);
1602   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1603 
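  // Round by adding and subtracting a large power of two with the sign of the
  // input; a sketch of the expansion below:
  //   tmp     = src + copysign(2^52, src)
  //   rounded = tmp - copysign(2^52, src)
  //   result  = |src| > 0x1.fffffffffffffp+51 ? src : rounded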
1604   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1605   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1606 
1607   auto C1 = B.buildFConstant(Ty, C1Val);
1608   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1609 
1610   // TODO: Should this propagate fast-math-flags?
1611   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1612   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1613 
1614   auto C2 = B.buildFConstant(Ty, C2Val);
1615   auto Fabs = B.buildFAbs(Ty, Src);
1616 
1617   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1618   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1620 }
1621 
1622 bool AMDGPULegalizerInfo::legalizeFceil(
1623   MachineInstr &MI, MachineRegisterInfo &MRI,
1624   MachineIRBuilder &B) const {
1625   B.setInstr(MI);
1626 
1627   const LLT S1 = LLT::scalar(1);
1628   const LLT S64 = LLT::scalar(64);
1629 
1630   Register Src = MI.getOperand(1).getReg();
1631   assert(MRI.getType(Src) == S64);
1632 
1633   // result = trunc(src)
1634   // if (src > 0.0 && src != result)
1635   //   result += 1.0
1636 
1637   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1638 
1639   const auto Zero = B.buildFConstant(S64, 0.0);
1640   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1644   auto Add = B.buildSelect(S64, And, One, Zero);
1645 
1646   // TODO: Should this propagate fast-math-flags?
1647   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1649 }
1650 
1651 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1652                                               MachineIRBuilder &B) {
1653   const unsigned FractBits = 52;
1654   const unsigned ExpBits = 11;
1655   LLT S32 = LLT::scalar(32);
1656 
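  // The biased exponent occupies bits [62:52] of the f64, i.e. bits [30:20] of
  // the high dword, so this computes roughly: exp = ubfe(hi, 20, 11) - 1023.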
1657   auto Const0 = B.buildConstant(S32, FractBits - 32);
1658   auto Const1 = B.buildConstant(S32, ExpBits);
1659 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1663 
1664   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1665 }
1666 
1667 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1668   MachineInstr &MI, MachineRegisterInfo &MRI,
1669   MachineIRBuilder &B) const {
1670   B.setInstr(MI);
1671 
1672   const LLT S1 = LLT::scalar(1);
1673   const LLT S32 = LLT::scalar(32);
1674   const LLT S64 = LLT::scalar(64);
1675 
1676   Register Src = MI.getOperand(1).getReg();
1677   assert(MRI.getType(Src) == S64);
1678 
1679   // TODO: Should this use extract since the low half is unused?
1680   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1681   Register Hi = Unmerge.getReg(1);
1682 
1683   // Extract the upper half, since this is where we will find the sign and
1684   // exponent.
1685   auto Exp = extractF64Exponent(Hi, B);
1686 
1687   const unsigned FractBits = 52;
1688 
1689   // Extract the sign bit.
1690   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1691   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1692 
1693   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1694 
1695   const auto Zero32 = B.buildConstant(S32, 0);
1696 
1697   // Extend back to 64-bits.
1698   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1699 
1700   auto Shr = B.buildAShr(S64, FractMask, Exp);
1701   auto Not = B.buildNot(S64, Shr);
1702   auto Tmp0 = B.buildAnd(S64, Src, Not);
1703   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1704 
1705   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1706   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1707 
1708   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1709   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1711 }
1712 
1713 bool AMDGPULegalizerInfo::legalizeITOFP(
1714   MachineInstr &MI, MachineRegisterInfo &MRI,
1715   MachineIRBuilder &B, bool Signed) const {
1716   B.setInstr(MI);
1717 
1718   Register Dst = MI.getOperand(0).getReg();
1719   Register Src = MI.getOperand(1).getReg();
1720 
1721   const LLT S64 = LLT::scalar(64);
1722   const LLT S32 = LLT::scalar(32);
1723 
1724   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1725 
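  // Convert in 32-bit halves; only the high half carries the sign in the
  // signed case. Roughly:
  //   result = ldexp(cvt(hi), 32) + uitofp(lo)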
1726   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1727 
1728   auto CvtHi = Signed ?
1729     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1730     B.buildUITOFP(S64, Unmerge.getReg(1));
1731 
1732   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1733 
1734   auto ThirtyTwo = B.buildConstant(S32, 32);
1735   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1736     .addUse(CvtHi.getReg(0))
1737     .addUse(ThirtyTwo.getReg(0));
1738 
1739   // TODO: Should this propagate fast-math-flags?
1740   B.buildFAdd(Dst, LdExp, CvtLo);
1741   MI.eraseFromParent();
1742   return true;
1743 }
1744 
1745 // TODO: Copied from DAG implementation. Verify logic and document how this
1746 // actually works.
1747 bool AMDGPULegalizerInfo::legalizeFPTOI(
1748   MachineInstr &MI, MachineRegisterInfo &MRI,
1749   MachineIRBuilder &B, bool Signed) const {
1750   B.setInstr(MI);
1751 
1752   Register Dst = MI.getOperand(0).getReg();
1753   Register Src = MI.getOperand(1).getReg();
1754 
1755   const LLT S64 = LLT::scalar(64);
1756   const LLT S32 = LLT::scalar(32);
1757 
1758   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1759 
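  // K0 = 2^-32 and K1 = -(2^32); the truncated value is split roughly as:
  //   hi = floor(trunc(x) * 2^-32)
  //   lo = fma(hi, -(2^32), trunc(x))
  // and each half is then converted to a 32-bit integer and merged.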
1760   unsigned Flags = MI.getFlags();
1761 
1762   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1763   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1764   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1765 
1766   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1767   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1768   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1769 
1770   auto Hi = Signed ?
1771     B.buildFPTOSI(S32, FloorMul) :
1772     B.buildFPTOUI(S32, FloorMul);
1773   auto Lo = B.buildFPTOUI(S32, Fma);
1774 
1775   B.buildMerge(Dst, { Lo, Hi });
1776   MI.eraseFromParent();
1777 
1778   return true;
1779 }
1780 
1781 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1782   MachineInstr &MI, MachineRegisterInfo &MRI,
1783   MachineIRBuilder &B) const {
1784   MachineFunction &MF = B.getMF();
1785   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1786 
1787   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1788                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1789 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1792   if (!MFI->getMode().IEEE)
1793     return !IsIEEEOp;
1794 
1795   if (IsIEEEOp)
1796     return true;
1797 
1798   MachineIRBuilder HelperBuilder(MI);
1799   GISelObserverWrapper DummyObserver;
1800   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1801   HelperBuilder.setInstr(MI);
1802   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1803 }
1804 
1805 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1806   MachineInstr &MI, MachineRegisterInfo &MRI,
1807   MachineIRBuilder &B) const {
1808   // TODO: Should move some of this into LegalizerHelper.
1809 
1810   // TODO: Promote dynamic indexing of s16 to s32
1811 
1812   // FIXME: Artifact combiner probably should have replaced the truncated
1813   // constant before this, so we shouldn't need
1814   // getConstantVRegValWithLookThrough.
1815   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1816     MI.getOperand(2).getReg(), MRI);
1817   if (!IdxVal) // Dynamic case will be selected to register indexing.
1818     return true;
1819 
1820   Register Dst = MI.getOperand(0).getReg();
1821   Register Vec = MI.getOperand(1).getReg();
1822 
1823   LLT VecTy = MRI.getType(Vec);
1824   LLT EltTy = VecTy.getElementType();
1825   assert(EltTy == MRI.getType(Dst));
1826 
1827   B.setInstr(MI);
1828 
1829   if (IdxVal->Value < VecTy.getNumElements())
1830     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1831   else
1832     B.buildUndef(Dst);
1833 
1834   MI.eraseFromParent();
1835   return true;
1836 }
1837 
1838 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1839   MachineInstr &MI, MachineRegisterInfo &MRI,
1840   MachineIRBuilder &B) const {
1841   // TODO: Should move some of this into LegalizerHelper.
1842 
1843   // TODO: Promote dynamic indexing of s16 to s32
1844 
1845   // FIXME: Artifact combiner probably should have replaced the truncated
1846   // constant before this, so we shouldn't need
1847   // getConstantVRegValWithLookThrough.
1848   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1849     MI.getOperand(3).getReg(), MRI);
1850   if (!IdxVal) // Dynamic case will be selected to register indexing.
1851     return true;
1852 
1853   Register Dst = MI.getOperand(0).getReg();
1854   Register Vec = MI.getOperand(1).getReg();
1855   Register Ins = MI.getOperand(2).getReg();
1856 
1857   LLT VecTy = MRI.getType(Vec);
1858   LLT EltTy = VecTy.getElementType();
1859   assert(EltTy == MRI.getType(Ins));
1860 
1861   B.setInstr(MI);
1862 
1863   if (IdxVal->Value < VecTy.getNumElements())
1864     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1865   else
1866     B.buildUndef(Dst);
1867 
1868   MI.eraseFromParent();
1869   return true;
1870 }
1871 
1872 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1873   MachineInstr &MI, MachineRegisterInfo &MRI,
1874   MachineIRBuilder &B) const {
1875   const LLT V2S16 = LLT::vector(2, 16);
1876 
1877   Register Dst = MI.getOperand(0).getReg();
1878   Register Src0 = MI.getOperand(1).getReg();
1879   LLT DstTy = MRI.getType(Dst);
1880   LLT SrcTy = MRI.getType(Src0);
1881 
1882   if (SrcTy == V2S16 && DstTy == V2S16 &&
1883       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1884     return true;
1885 
1886   MachineIRBuilder HelperBuilder(MI);
1887   GISelObserverWrapper DummyObserver;
1888   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1889   HelperBuilder.setInstr(MI);
1890   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1891 }
1892 
1893 bool AMDGPULegalizerInfo::legalizeSinCos(
1894   MachineInstr &MI, MachineRegisterInfo &MRI,
1895   MachineIRBuilder &B) const {
1896   B.setInstr(MI);
1897 
1898   Register DstReg = MI.getOperand(0).getReg();
1899   Register SrcReg = MI.getOperand(1).getReg();
1900   LLT Ty = MRI.getType(DstReg);
1901   unsigned Flags = MI.getFlags();
1902 
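  // The hardware sin/cos expect an input scaled by 1/(2*pi). On subtargets
  // with a reduced valid input range the scaled value is also passed through
  // amdgcn.fract, so the expansion is roughly:
  //   trig(x) -> amdgcn.{sin,cos}(fract(x * 1/(2*pi)))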
1903   Register TrigVal;
1904   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1905   if (ST.hasTrigReducedRange()) {
1906     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1907     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1908       .addUse(MulVal.getReg(0))
1909       .setMIFlags(Flags).getReg(0);
1910   } else
1911     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1912 
1913   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1914     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1915   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1916     .addUse(TrigVal)
1917     .setMIFlags(Flags);
1918   MI.eraseFromParent();
1919   return true;
1920 }
1921 
1922 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1923   Register DstReg, LLT PtrTy,
1924   MachineIRBuilder &B, const GlobalValue *GV,
1925   unsigned Offset, unsigned GAFlags) const {
1926   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1927   // to the following code sequence:
1928   //
1929   // For constant address space:
1930   //   s_getpc_b64 s[0:1]
1931   //   s_add_u32 s0, s0, $symbol
1932   //   s_addc_u32 s1, s1, 0
1933   //
1934   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1935   //   a fixup or relocation is emitted to replace $symbol with a literal
1936   //   constant, which is a pc-relative offset from the encoding of the $symbol
1937   //   operand to the global variable.
1938   //
1939   // For global address space:
1940   //   s_getpc_b64 s[0:1]
1941   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1942   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1943   //
1944   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1945   //   fixups or relocations are emitted to replace $symbol@*@lo and
1946   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1947   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1948   //   operand to the global variable.
1949   //
1950   // What we want here is an offset from the value returned by s_getpc
1951   // (which is the address of the s_add_u32 instruction) to the global
1952   // variable, but since the encoding of $symbol starts 4 bytes after the start
1953   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1954   // small. This requires us to add 4 to the global variable offset in order to
1955   // compute the correct address.
1956 
1957   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1958 
1959   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1960     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1961 
1962   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1963     .addDef(PCReg);
1964 
1965   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1966   if (GAFlags == SIInstrInfo::MO_NONE)
1967     MIB.addImm(0);
1968   else
1969     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1970 
1971   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1972 
1973   if (PtrTy.getSizeInBits() == 32)
1974     B.buildExtract(DstReg, PCReg, 0);
1975   return true;
1976  }
1977 
1978 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1979   MachineInstr &MI, MachineRegisterInfo &MRI,
1980   MachineIRBuilder &B) const {
1981   Register DstReg = MI.getOperand(0).getReg();
1982   LLT Ty = MRI.getType(DstReg);
1983   unsigned AS = Ty.getAddressSpace();
1984 
1985   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1986   MachineFunction &MF = B.getMF();
1987   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1988   B.setInstr(MI);
1989 
1990   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1991     if (!MFI->isEntryFunction()) {
1992       const Function &Fn = MF.getFunction();
1993       DiagnosticInfoUnsupported BadLDSDecl(
1994         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1995         DS_Warning);
1996       Fn.getContext().diagnose(BadLDSDecl);
1997 
1998       // We currently don't have a way to correctly allocate LDS objects that
1999       // aren't directly associated with a kernel. We do force inlining of
2000       // functions that use local objects. However, if these dead functions are
2001       // not eliminated, we don't want a compile time error. Just emit a warning
2002       // and a trap, since there should be no callable path here.
2003       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2004       B.buildUndef(DstReg);
2005       MI.eraseFromParent();
2006       return true;
2007     }
2008 
2009     // TODO: We could emit code to handle the initialization somewhere.
2010     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2011       const SITargetLowering *TLI = ST.getTargetLowering();
2012       if (!TLI->shouldUseLDSConstAddress(GV)) {
2013         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
2015       }
2016 
2017       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2018       MI.eraseFromParent();
2019       return true;
2020     }
2021 
2022     const Function &Fn = MF.getFunction();
2023     DiagnosticInfoUnsupported BadInit(
2024       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2025     Fn.getContext().diagnose(BadInit);
2026     return true;
2027   }
2028 
2029   const SITargetLowering *TLI = ST.getTargetLowering();
2030 
2031   if (TLI->shouldEmitFixup(GV)) {
2032     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2033     MI.eraseFromParent();
2034     return true;
2035   }
2036 
2037   if (TLI->shouldEmitPCReloc(GV)) {
2038     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2039     MI.eraseFromParent();
2040     return true;
2041   }
2042 
2043   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2044   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2045 
2046   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2047       MachinePointerInfo::getGOT(MF),
2048       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2049           MachineMemOperand::MOInvariant,
2050       8 /*Size*/, Align(8));
2051 
2052   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2053 
2054   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2056     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2057     B.buildExtract(DstReg, Load, 0);
2058   } else
2059     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2060 
2061   MI.eraseFromParent();
2062   return true;
2063 }
2064 
2065 bool AMDGPULegalizerInfo::legalizeLoad(
2066   MachineInstr &MI, MachineRegisterInfo &MRI,
2067   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2068   B.setInstr(MI);
2069   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2070   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2071   Observer.changingInstr(MI);
2072   MI.getOperand(1).setReg(Cast.getReg(0));
2073   Observer.changedInstr(MI);
2074   return true;
2075 }
2076 
2077 bool AMDGPULegalizerInfo::legalizeFMad(
2078   MachineInstr &MI, MachineRegisterInfo &MRI,
2079   MachineIRBuilder &B) const {
2080   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2081   assert(Ty.isScalar());
2082 
2083   MachineFunction &MF = B.getMF();
2084   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2085 
2086   // TODO: Always legal with future ftz flag.
2087   // FIXME: Do we need just output?
2088   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2089     return true;
2090   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2091     return true;
2092 
2093   MachineIRBuilder HelperBuilder(MI);
2094   GISelObserverWrapper DummyObserver;
2095   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2096   HelperBuilder.setInstr(MI);
2097   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2098 }
2099 
2100 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2101   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2102   Register DstReg = MI.getOperand(0).getReg();
2103   Register PtrReg = MI.getOperand(1).getReg();
2104   Register CmpVal = MI.getOperand(2).getReg();
2105   Register NewVal = MI.getOperand(3).getReg();
2106 
2107   assert(SITargetLowering::isFlatGlobalAddrSpace(
2108            MRI.getType(PtrReg).getAddressSpace()) &&
2109          "this should not have been custom lowered");
2110 
2111   LLT ValTy = MRI.getType(CmpVal);
2112   LLT VecTy = LLT::vector(2, ValTy);
2113 
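  // The target instruction takes the new and compare values packed into a
  // single vector operand: G_AMDGPU_ATOMIC_CMPXCHG %ptr, <%newval, %cmpval>.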
2114   B.setInstr(MI);
2115   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2116 
2117   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2118     .addDef(DstReg)
2119     .addUse(PtrReg)
2120     .addUse(PackedVal)
2121     .setMemRefs(MI.memoperands());
2122 
2123   MI.eraseFromParent();
2124   return true;
2125 }
2126 
2127 bool AMDGPULegalizerInfo::legalizeFlog(
2128   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2129   Register Dst = MI.getOperand(0).getReg();
2130   Register Src = MI.getOperand(1).getReg();
2131   LLT Ty = B.getMRI()->getType(Dst);
2132   unsigned Flags = MI.getFlags();
2133   B.setInstr(MI);
2134 
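  // log_b(x) is expanded as log2(x) * (1 / log2(b)); the caller passes
  // 1/log2(e) for G_FLOG and ln(2)/ln(10) for G_FLOG10.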
2135   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2136   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2137 
2138   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2139   MI.eraseFromParent();
2140   return true;
2141 }
2142 
2143 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2144                                        MachineIRBuilder &B) const {
2145   Register Dst = MI.getOperand(0).getReg();
2146   Register Src = MI.getOperand(1).getReg();
2147   unsigned Flags = MI.getFlags();
2148   LLT Ty = B.getMRI()->getType(Dst);
2149   B.setInstr(MI);
2150 
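  // exp(x) is expanded as exp2(x * log2(e)).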
2151   auto K = B.buildFConstant(Ty, numbers::log2e);
2152   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2153   B.buildFExp2(Dst, Mul, Flags);
2154   MI.eraseFromParent();
2155   return true;
2156 }
2157 
2158 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2159                                        MachineIRBuilder &B) const {
2160   Register Dst = MI.getOperand(0).getReg();
2161   Register Src0 = MI.getOperand(1).getReg();
2162   Register Src1 = MI.getOperand(2).getReg();
2163   unsigned Flags = MI.getFlags();
2164   LLT Ty = B.getMRI()->getType(Dst);
2165   B.setInstr(MI);
2166   const LLT S16 = LLT::scalar(16);
2167   const LLT S32 = LLT::scalar(32);
2168 
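  // pow(x, y) is expanded as exp2(y * log2(x)); the multiply uses
  // amdgcn.fmul_legacy (0 * x == 0 semantics).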
2169   if (Ty == S32) {
2170     auto Log = B.buildFLog2(S32, Src0, Flags);
2171     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2172       .addUse(Log.getReg(0))
2173       .addUse(Src1)
2174       .setMIFlags(Flags);
2175     B.buildFExp2(Dst, Mul, Flags);
2176   } else if (Ty == S16) {
2177     // There's no f16 fmul_legacy, so we need to convert for it.
2178     auto Log = B.buildFLog2(S16, Src0, Flags);
2179     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2180     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2181     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2182       .addUse(Ext0.getReg(0))
2183       .addUse(Ext1.getReg(0))
2184       .setMIFlags(Flags);
2185 
2186     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2187   } else
2188     return false;
2189 
2190   MI.eraseFromParent();
2191   return true;
2192 }
2193 
2194 // Find a source register, ignoring any possible source modifiers.
2195 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2196   Register ModSrc = OrigSrc;
2197   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2198     ModSrc = SrcFNeg->getOperand(1).getReg();
2199     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2200       ModSrc = SrcFAbs->getOperand(1).getReg();
2201   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2202     ModSrc = SrcFAbs->getOperand(1).getReg();
2203   return ModSrc;
2204 }
2205 
2206 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2207                                          MachineRegisterInfo &MRI,
2208                                          MachineIRBuilder &B) const {
2209   B.setInstr(MI);
2210 
2211   const LLT S1 = LLT::scalar(1);
2212   const LLT S64 = LLT::scalar(64);
2213   Register Dst = MI.getOperand(0).getReg();
2214   Register OrigSrc = MI.getOperand(1).getReg();
2215   unsigned Flags = MI.getFlags();
2216   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2217          "this should not have been custom lowered");
2218 
2219   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2220   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2221   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2222   // V_FRACT bug is:
2223   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2224   //
2225   // Convert floor(x) to (x - fract(x))
2226 
2227   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2228     .addUse(OrigSrc)
2229     .setMIFlags(Flags);
2230 
2231   // Give source modifier matching some assistance before obscuring a foldable
2232   // pattern.
2233 
2234   // TODO: We can avoid the neg on the fract? The input sign to fract
2235   // shouldn't matter?
2236   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2237 
2238   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2239 
2240   Register Min = MRI.createGenericVirtualRegister(S64);
2241 
2242   // We don't need to concern ourselves with the snan handling difference, so
2243   // use the one which will directly select.
2244   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2245   if (MFI->getMode().IEEE)
2246     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2247   else
2248     B.buildFMinNum(Min, Fract, Const, Flags);
2249 
2250   Register CorrectedFract = Min;
2251   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2252     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2253     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2254   }
2255 
2256   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2257   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2258 
2259   MI.eraseFromParent();
2260   return true;
2261 }
2262 
2263 // Turn an illegal packed v2s16 build vector into bit operations.
2264 // TODO: This should probably be a bitcast action in LegalizerHelper.
2265 bool AMDGPULegalizerInfo::legalizeBuildVector(
2266   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2267   Register Dst = MI.getOperand(0).getReg();
2268   const LLT S32 = LLT::scalar(32);
2269   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2270 
2271   Register Src0 = MI.getOperand(1).getReg();
2272   Register Src1 = MI.getOperand(2).getReg();
2273   assert(MRI.getType(Src0) == LLT::scalar(16));
2274 
2275   B.setInstr(MI);
2276   auto Merge = B.buildMerge(S32, {Src0, Src1});
2277   B.buildBitcast(Dst, Merge);
2278 
2279   MI.eraseFromParent();
2280   return true;
2281 }
2282 
2283 // Return the use branch instruction, otherwise null if the usage is invalid.
2284 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2285                                        MachineRegisterInfo &MRI,
2286                                        MachineInstr *&Br,
2287                                        MachineBasicBlock *&UncondBrTarget) {
2288   Register CondDef = MI.getOperand(0).getReg();
2289   if (!MRI.hasOneNonDBGUse(CondDef))
2290     return nullptr;
2291 
2292   MachineBasicBlock *Parent = MI.getParent();
2293   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2294   if (UseMI.getParent() != Parent ||
2295       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2296     return nullptr;
2297 
2298   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2299   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2300   if (Next == Parent->end()) {
2301     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2302     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2303       return nullptr;
2304     UncondBrTarget = &*NextMBB;
2305   } else {
2306     if (Next->getOpcode() != AMDGPU::G_BR)
2307       return nullptr;
2308     Br = &*Next;
2309     UncondBrTarget = Br->getOperand(0).getMBB();
2310   }
2311 
2312   return &UseMI;
2313 }
2314 
2315 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2316                                                MachineRegisterInfo &MRI,
2317                                                Register LiveIn,
2318                                                Register PhyReg) const {
2319   assert(PhyReg.isPhysical() && "Physical register expected");
2320 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2323   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2324   if (!MRI.getVRegDef(LiveIn)) {
2325     // FIXME: Should have scoped insert pt
2326     MachineBasicBlock &OrigInsBB = B.getMBB();
2327     auto OrigInsPt = B.getInsertPt();
2328 
2329     MachineBasicBlock &EntryMBB = B.getMF().front();
2330     EntryMBB.addLiveIn(PhyReg);
2331     B.setInsertPt(EntryMBB, EntryMBB.begin());
2332     B.buildCopy(LiveIn, PhyReg);
2333 
2334     B.setInsertPt(OrigInsBB, OrigInsPt);
2335   }
2336 
2337   return LiveIn;
2338 }
2339 
2340 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2341                                                 MachineRegisterInfo &MRI,
2342                                                 Register PhyReg, LLT Ty,
2343                                                 bool InsertLiveInCopy) const {
2344   assert(PhyReg.isPhysical() && "Physical register expected");
2345 
  // Get or create the virtual live-in register.
2347   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2348   if (!LiveIn) {
2349     LiveIn = MRI.createGenericVirtualRegister(Ty);
2350     MRI.addLiveIn(PhyReg, LiveIn);
2351   }
2352 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), the live-in copy from the physical
  // register to a virtual register is not required here.
2356   if (!InsertLiveInCopy)
2357     return LiveIn;
2358 
2359   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2360 }
2361 
2362 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2363     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2364   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2365   const ArgDescriptor *Arg;
2366   const TargetRegisterClass *RC;
2367   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2368   if (!Arg) {
2369     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2370     return nullptr;
2371   }
2372   return Arg;
2373 }
2374 
2375 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2376                                          const ArgDescriptor *Arg) const {
2377   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2378     return false; // TODO: Handle these
2379 
2380   Register SrcReg = Arg->getRegister();
2381   assert(SrcReg.isPhysical() && "Physical register expected");
2382   assert(DstReg.isVirtual() && "Virtual register expected");
2383 
2384   MachineRegisterInfo &MRI = *B.getMRI();
2385 
2386   LLT Ty = MRI.getType(DstReg);
2387   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2388 
2389   if (Arg->isMasked()) {
2390     // TODO: Should we try to emit this once in the entry block?
2391     const LLT S32 = LLT::scalar(32);
2392     const unsigned Mask = Arg->getMask();
2393     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2394 
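    // The input is packed into a wider register together with other values
    // (e.g. the packed work-item IDs), so shift it down and mask off the rest.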
2395     Register AndMaskSrc = LiveIn;
2396 
2397     if (Shift != 0) {
2398       auto ShiftAmt = B.buildConstant(S32, Shift);
2399       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2400     }
2401 
2402     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2403   } else {
2404     B.buildCopy(DstReg, LiveIn);
2405   }
2406 
2407   return true;
2408 }
2409 
2410 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2411     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2412     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2413   B.setInstr(MI);
2414 
2415   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2416   if (!Arg)
2417     return false;
2418 
2419   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2420     return false;
2421 
2422   MI.eraseFromParent();
2423   return true;
2424 }
2425 
2426 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2427                                        MachineRegisterInfo &MRI,
2428                                        MachineIRBuilder &B) const {
2429   B.setInstr(MI);
2430   Register Dst = MI.getOperand(0).getReg();
2431   LLT DstTy = MRI.getType(Dst);
2432   LLT S16 = LLT::scalar(16);
2433   LLT S32 = LLT::scalar(32);
2434   LLT S64 = LLT::scalar(64);
2435 
2436   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2437     return true;
2438 
2439   if (DstTy == S16)
2440     return legalizeFDIV16(MI, MRI, B);
2441   if (DstTy == S32)
2442     return legalizeFDIV32(MI, MRI, B);
2443   if (DstTy == S64)
2444     return legalizeFDIV64(MI, MRI, B);
2445 
2446   return false;
2447 }
2448 
2449 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2450   const LLT S32 = LLT::scalar(32);
2451 
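  // Approximate 2^32 / Src, roughly:
  //   %f   = G_UITOFP %src
  //   %rcp = G_AMDGPU_RCP_IFLAG %f
  //   %mul = G_FMUL %rcp, 0x4f800000  (2^32 as f32)
  //   %res = G_FPTOUI %mul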
2452   auto Cvt0 = B.buildUITOFP(S32, Src);
2453   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2454   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2455   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2456   return B.buildFPTOUI(S32, Mul).getReg(0);
2457 }
2458 
2459 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2460                                                   Register DstReg,
2461                                                   Register Num,
2462                                                   Register Den,
2463                                                   bool IsRem) const {
2464   const LLT S1 = LLT::scalar(1);
2465   const LLT S32 = LLT::scalar(32);
2466 
2467   // RCP =  URECIP(Den) = 2^32 / Den + e
2468   // e is rounding error.
2469   auto RCP = buildDivRCP(B, Den);
2470 
2471   // RCP_LO = mul(RCP, Den)
2472   auto RCP_LO = B.buildMul(S32, RCP, Den);
2473 
  // RCP_HI = mulhu(RCP, Den)
2475   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2476 
2477   // NEG_RCP_LO = -RCP_LO
2478   auto Zero = B.buildConstant(S32, 0);
2479   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2480 
2481   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2482   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2483   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2484 
2485   // Calculate the rounding error from the URECIP instruction
2486   // E = mulhu(ABS_RCP_LO, RCP)
2487   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2488 
2489   // RCP_A_E = RCP + E
2490   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2491 
2492   // RCP_S_E = RCP - E
2493   auto RCP_S_E = B.buildSub(S32, RCP, E);
2494 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2496   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2497 
  // Quotient = mulhu(Tmp0, Num)
2499   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2500 
2501   // Num_S_Remainder = Quotient * Den
2502   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2503 
2504   // Remainder = Num - Num_S_Remainder
2505   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2506 
2507   // Remainder_GE_Den = Remainder >= Den
2508   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2509 
2510   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2511   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2512                                        Num, Num_S_Remainder);
2513 
2514   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2515   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2516 
2517   // Calculate Division result:
2518 
2519   // Quotient_A_One = Quotient + 1
2520   auto One = B.buildConstant(S32, 1);
2521   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2522 
2523   // Quotient_S_One = Quotient - 1
2524   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2525 
2526   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2527   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2528 
2529   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2530   if (IsRem) {
2531     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2532 
2533     // Calculate Rem result:
2534     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2535 
2536     // Remainder_A_Den = Remainder + Den
2537     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2538 
2539     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2540     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2541 
2542     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2543     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2544   } else {
2545     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2546   }
2547 }
2548 
2549 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2550                                               MachineRegisterInfo &MRI,
2551                                               MachineIRBuilder &B) const {
2552   B.setInstr(MI);
2553   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2554   Register DstReg = MI.getOperand(0).getReg();
2555   Register Num = MI.getOperand(1).getReg();
2556   Register Den = MI.getOperand(2).getReg();
2557   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2558   MI.eraseFromParent();
2559   return true;
2560 }
2561 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2563 //
2564 // Return lo, hi of result
2565 //
2566 // %cvt.lo = G_UITOFP Val.lo
2567 // %cvt.hi = G_UITOFP Val.hi
2568 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2569 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2570 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2571 // %mul2 = G_FMUL %mul1, 2**(-32)
2572 // %trunc = G_INTRINSIC_TRUNC %mul2
2573 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2574 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2575 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2576                                                        Register Val) {
2577   const LLT S32 = LLT::scalar(32);
2578   auto Unmerge = B.buildUnmerge(S32, Val);
2579 
2580   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2581   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2582 
2583   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2584                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2585 
2586   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2587   auto Mul1 =
2588       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2589 
2590   // 2**(-32)
2591   auto Mul2 =
2592       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2593   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2594 
2595   // -(2**32)
2596   auto Mad2 = B.buildFMAD(S32, Trunc,
2597                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2598 
2599   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2600   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2601 
2602   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2603 }
2604 
2605 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2606                                               MachineRegisterInfo &MRI,
2607                                               MachineIRBuilder &B) const {
2608   B.setInstr(MI);
2609 
2610   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2611   const LLT S32 = LLT::scalar(32);
2612   const LLT S64 = LLT::scalar(64);
2613   const LLT S1 = LLT::scalar(1);
2614   Register Numer = MI.getOperand(1).getReg();
2615   Register Denom = MI.getOperand(2).getReg();
2616   Register RcpLo, RcpHi;
2617 
2618   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2619 
2620   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2621 
2622   auto Zero64 = B.buildConstant(S64, 0);
2623   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2624 
2625   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2626   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2627 
2628   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2629   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2630   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2631 
2632   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2633   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2634   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2635   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2636 
2637   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2638   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2639   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2640   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2641   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2642 
2643   auto Zero32 = B.buildConstant(S32, 0);
2644   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2645   auto Add2_HiC =
2646       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2647   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2648   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2649 
2650   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2651   Register NumerLo = UnmergeNumer.getReg(0);
2652   Register NumerHi = UnmergeNumer.getReg(1);
2653 
2654   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2655   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2656   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2657   Register Mul3_Lo = UnmergeMul3.getReg(0);
2658   Register Mul3_Hi = UnmergeMul3.getReg(1);
2659   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2660   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2661   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2662   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2663 
2664   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2665   Register DenomLo = UnmergeDenom.getReg(0);
2666   Register DenomHi = UnmergeDenom.getReg(1);
2667 
2668   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2669   auto C1 = B.buildSExt(S32, CmpHi);
2670 
2671   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2672   auto C2 = B.buildSExt(S32, CmpLo);
2673 
2674   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2675   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2676 
  // TODO: Here and below portions of the code can be enclosed in if/endif
  // blocks. Currently control flow is unconditional and we have 4 selects
  // after the potential endif to substitute PHIs.
2680 
2681   // if C3 != 0 ...
2682   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2683   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2684   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2685   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2686 
2687   auto One64 = B.buildConstant(S64, 1);
2688   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2689 
2690   auto C4 =
2691       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2692   auto C5 =
2693       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2694   auto C6 = B.buildSelect(
2695       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2696 
2697   // if (C6 != 0)
2698   auto Add4 = B.buildAdd(S64, Add3, One64);
2699   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2700 
2701   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2702   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2703   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2704 
2705   // endif C6
2706   // endif C3
2707 
2708   if (IsDiv) {
2709     auto Sel1 = B.buildSelect(
2710         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2711     B.buildSelect(MI.getOperand(0),
2712                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2713   } else {
2714     auto Sel2 = B.buildSelect(
2715         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2716     B.buildSelect(MI.getOperand(0),
2717                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2718   }
2719 
2720   MI.eraseFromParent();
2721   return true;
2722 }
2723 
2724 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2725                                             MachineRegisterInfo &MRI,
2726                                             MachineIRBuilder &B) const {
2727   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2728   if (Ty == LLT::scalar(32))
2729     return legalizeUDIV_UREM32(MI, MRI, B);
2730   if (Ty == LLT::scalar(64))
2731     return legalizeUDIV_UREM64(MI, MRI, B);
2732   return false;
2733 }
2734 
2735 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2736                                               MachineRegisterInfo &MRI,
2737                                               MachineIRBuilder &B) const {
2738   B.setInstr(MI);
2739   const LLT S32 = LLT::scalar(32);
2740 
2741   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2742   Register DstReg = MI.getOperand(0).getReg();
2743   Register LHS = MI.getOperand(1).getReg();
2744   Register RHS = MI.getOperand(2).getReg();
2745 
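  // Signed div/rem is built on top of the unsigned expansion, roughly:
  //   sign(x) = x >> 31   (arithmetic)
  //   |x|     = (x + sign(x)) ^ sign(x)
  //   sdiv    = (udiv(|lhs|, |rhs|) ^ (slhs ^ srhs)) - (slhs ^ srhs)
  //   srem    = (urem(|lhs|, |rhs|) ^ slhs) - slhs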
2746   auto ThirtyOne = B.buildConstant(S32, 31);
2747   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2749 
2750   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2751   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2752 
2753   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2754   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2755 
2756   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2757   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2758 
2759   if (IsRem) {
2760     auto RSign = LHSign; // Remainder sign is the same as LHS
2761     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2762     B.buildSub(DstReg, UDivRem, RSign);
2763   } else {
2764     auto DSign = B.buildXor(S32, LHSign, RHSign);
2765     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2766     B.buildSub(DstReg, UDivRem, DSign);
2767   }
2768 
2769   MI.eraseFromParent();
2770   return true;
2771 }
2772 
2773 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2774                                             MachineRegisterInfo &MRI,
2775                                             MachineIRBuilder &B) const {
2776   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2777     return legalizeSDIV_SREM32(MI, MRI, B);
2778   return false;
2779 }
2780 
2781 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2782                                                  MachineRegisterInfo &MRI,
2783                                                  MachineIRBuilder &B) const {
2784   Register Res = MI.getOperand(0).getReg();
2785   Register LHS = MI.getOperand(1).getReg();
2786   Register RHS = MI.getOperand(2).getReg();
2787 
2788   uint16_t Flags = MI.getFlags();
2789 
2790   LLT ResTy = MRI.getType(Res);
2791   LLT S32 = LLT::scalar(32);
2792   LLT S64 = LLT::scalar(64);
2793 
2794   const MachineFunction &MF = B.getMF();
2795   bool Unsafe =
2796     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2797 
2798   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2799     return false;
2800 
2801   if (!Unsafe && ResTy == S32 &&
2802       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2803     return false;
2804 
2805   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2806     // 1 / x -> RCP(x)
2807     if (CLHS->isExactlyValue(1.0)) {
2808       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2809         .addUse(RHS)
2810         .setMIFlags(Flags);
2811 
2812       MI.eraseFromParent();
2813       return true;
2814     }
2815 
2816     // -1 / x -> RCP( FNEG(x) )
2817     if (CLHS->isExactlyValue(-1.0)) {
2818       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2819       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2820         .addUse(FNeg.getReg(0))
2821         .setMIFlags(Flags);
2822 
2823       MI.eraseFromParent();
2824       return true;
2825     }
2826   }
2827 
2828   // x / y -> x * (1.0 / y)
2829   if (Unsafe) {
2830     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2831       .addUse(RHS)
2832       .setMIFlags(Flags);
2833     B.buildFMul(Res, LHS, RCP, Flags);
2834 
2835     MI.eraseFromParent();
2836     return true;
2837   }
2838 
2839   return false;
2840 }
2841 
2842 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2843                                          MachineRegisterInfo &MRI,
2844                                          MachineIRBuilder &B) const {
2845   B.setInstr(MI);
2846   Register Res = MI.getOperand(0).getReg();
2847   Register LHS = MI.getOperand(1).getReg();
2848   Register RHS = MI.getOperand(2).getReg();
2849 
2850   uint16_t Flags = MI.getFlags();
2851 
2852   LLT S16 = LLT::scalar(16);
2853   LLT S32 = LLT::scalar(32);
2854 
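  // f16 division is performed in f32, roughly:
  //   quot = fptrunc(fpext(lhs) * amdgcn.rcp(fpext(rhs)))
  //   res  = amdgcn.div.fixup(quot, rhs, lhs)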
2855   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2856   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2857 
2858   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2859     .addUse(RHSExt.getReg(0))
2860     .setMIFlags(Flags);
2861 
2862   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2863   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2864 
2865   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2866     .addUse(RDst.getReg(0))
2867     .addUse(RHS)
2868     .addUse(LHS)
2869     .setMIFlags(Flags);
2870 
2871   MI.eraseFromParent();
2872   return true;
2873 }
2874 
2875 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2876 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2877 static void toggleSPDenormMode(bool Enable,
2878                                MachineIRBuilder &B,
2879                                const GCNSubtarget &ST,
2880                                AMDGPU::SIModeRegisterDefaults Mode) {
2881   // Set SP denorm mode to this value.
2882   unsigned SPDenormMode =
2883     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2884 
2885   if (ST.hasDenormModeInst()) {
    // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
2887     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2888 
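    // The S_DENORM_MODE immediate packs the FP32 denorm mode in bits [1:0] and
    // the FP64/FP16 denorm mode in bits [3:2], hence the shift by two below.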
2889     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2890     B.buildInstr(AMDGPU::S_DENORM_MODE)
2891       .addImm(NewDenormModeValue);
2892 
2893   } else {
2894     // Select FP32 bit field in mode register.
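    // This encodes hwreg(HW_REG_MODE, 4, 2): a two-bit field starting at bit 4,
    // i.e. MODE[5:4], which holds the FP32 denormal controls.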
2895     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2896                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2897                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2898 
2899     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2900       .addImm(SPDenormMode)
2901       .addImm(SPDenormModeBitField);
2902   }
2903 }
2904 
2905 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2906                                          MachineRegisterInfo &MRI,
2907                                          MachineIRBuilder &B) const {
2908   B.setInstr(MI);
2909   Register Res = MI.getOperand(0).getReg();
2910   Register LHS = MI.getOperand(1).getReg();
2911   Register RHS = MI.getOperand(2).getReg();
2912   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2913   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2914 
2915   uint16_t Flags = MI.getFlags();
2916 
2917   LLT S32 = LLT::scalar(32);
2918   LLT S1 = LLT::scalar(1);
2919 
2920   auto One = B.buildFConstant(S32, 1.0f);
2921 
2922   auto DenominatorScaled =
2923     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2924       .addUse(LHS)
2925       .addUse(RHS)
2926       .addImm(0)
2927       .setMIFlags(Flags);
2928   auto NumeratorScaled =
2929     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2930       .addUse(LHS)
2931       .addUse(RHS)
2932       .addImm(1)
2933       .setMIFlags(Flags);
2934 
2935   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2936     .addUse(DenominatorScaled.getReg(0))
2937     .setMIFlags(Flags);
2938   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2939 
2940   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2941   // aren't modeled as reading it.
2942   if (!Mode.allFP32Denormals())
2943     toggleSPDenormMode(true, B, ST, Mode);
2944 
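  // Newton-Raphson refinement of the reciprocal and quotient, on the
  // div_scale-adjusted numerator n and denominator d, with r0 = ApproxRcp:
  //   Fma0 = 1 - d*r0        error of the initial reciprocal estimate
  //   Fma1 = r0 + r0*Fma0    refined reciprocal r1
  //   Mul  = n*r1            initial quotient q0
  //   Fma2 = n - d*q0        quotient residual
  //   Fma3 = q0 + r1*Fma2    refined quotient q1
  //   Fma4 = n - d*q1        final residual, consumed by div_fmas below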
2945   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2946   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2947   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2948   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2949   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2950   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2951 
2952   if (!Mode.allFP32Denormals())
2953     toggleSPDenormMode(false, B, ST, Mode);
2954 
2955   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2956     .addUse(Fma4.getReg(0))
2957     .addUse(Fma1.getReg(0))
2958     .addUse(Fma3.getReg(0))
2959     .addUse(NumeratorScaled.getReg(1))
2960     .setMIFlags(Flags);
2961 
2962   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2963     .addUse(Fmas.getReg(0))
2964     .addUse(RHS)
2965     .addUse(LHS)
2966     .setMIFlags(Flags);
2967 
2968   MI.eraseFromParent();
2969   return true;
2970 }
2971 
2972 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2973                                          MachineRegisterInfo &MRI,
2974                                          MachineIRBuilder &B) const {
2975   B.setInstr(MI);
2976   Register Res = MI.getOperand(0).getReg();
2977   Register LHS = MI.getOperand(1).getReg();
2978   Register RHS = MI.getOperand(2).getReg();
2979 
2980   uint16_t Flags = MI.getFlags();
2981 
2982   LLT S64 = LLT::scalar(64);
2983   LLT S1 = LLT::scalar(1);
2984 
2985   auto One = B.buildFConstant(S64, 1.0);
2986 
2987   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2988     .addUse(LHS)
2989     .addUse(RHS)
2990     .addImm(0)
2991     .setMIFlags(Flags);
2992 
2993   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2994 
2995   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2996     .addUse(DivScale0.getReg(0))
2997     .setMIFlags(Flags);
2998 
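  // Two Newton-Raphson steps refine the reciprocal of the scaled denominator
  // (Fma0/Fma1, then Fma2/Fma3). The refined reciprocal is multiplied by the
  // scaled numerator to form the quotient estimate (Mul), and Fma4 computes
  // the residual consumed by div_fmas below.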
2999   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3000   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3001   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3002 
3003   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3004     .addUse(LHS)
3005     .addUse(RHS)
3006     .addImm(1)
3007     .setMIFlags(Flags);
3008 
3009   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3010   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3011   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3012 
3013   Register Scale;
3014   if (!ST.hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
3017 
3018     LLT S32 = LLT::scalar(32);
3019 
3020     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3021     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3022     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3023     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3024 
3025     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3026                               Scale1Unmerge.getReg(1));
3027     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3028                               Scale0Unmerge.getReg(1));
3029     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3030   } else {
3031     Scale = DivScale1.getReg(1);
3032   }
3033 
3034   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3035     .addUse(Fma4.getReg(0))
3036     .addUse(Fma3.getReg(0))
3037     .addUse(Mul.getReg(0))
3038     .addUse(Scale)
3039     .setMIFlags(Flags);
3040 
3041   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3042     .addUse(Fmas.getReg(0))
3043     .addUse(RHS)
3044     .addUse(LHS)
3045     .setMIFlags(Flags);
3046 
3047   MI.eraseFromParent();
3048   return true;
3049 }
3050 
3051 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3052                                                  MachineRegisterInfo &MRI,
3053                                                  MachineIRBuilder &B) const {
3054   B.setInstr(MI);
3055   Register Res = MI.getOperand(0).getReg();
3056   Register LHS = MI.getOperand(2).getReg();
3057   Register RHS = MI.getOperand(3).getReg();
3058   uint16_t Flags = MI.getFlags();
3059 
3060   LLT S32 = LLT::scalar(32);
3061   LLT S1 = LLT::scalar(1);
3062 
3063   auto Abs = B.buildFAbs(S32, RHS, Flags);
3064   const APFloat C0Val(1.0f);
3065 
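  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. If |RHS| is larger than 2^96,
  // the denominator is pre-scaled by 2^-32 so that the reciprocal stays in
  // range, and the final multiply by the same scale factor cancels it out.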
3066   auto C0 = B.buildConstant(S32, 0x6f800000);
3067   auto C1 = B.buildConstant(S32, 0x2f800000);
3068   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3069 
3070   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3071   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3072 
3073   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3074 
3075   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3076     .addUse(Mul0.getReg(0))
3077     .setMIFlags(Flags);
3078 
3079   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3080 
3081   B.buildFMul(Res, Sel, Mul1, Flags);
3082 
3083   MI.eraseFromParent();
3084   return true;
3085 }
3086 
3087 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3088                                                  MachineRegisterInfo &MRI,
3089                                                  MachineIRBuilder &B) const {
3090   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3091   if (!MFI->isEntryFunction()) {
3092     return legalizePreloadedArgIntrin(MI, MRI, B,
3093                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3094   }
3095 
3096   B.setInstr(MI);
3097 
3098   uint64_t Offset =
3099     ST.getTargetLowering()->getImplicitParameterOffset(
3100       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3101   Register DstReg = MI.getOperand(0).getReg();
3102   LLT DstTy = MRI.getType(DstReg);
3103   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3104 
3105   const ArgDescriptor *Arg;
3106   const TargetRegisterClass *RC;
3107   std::tie(Arg, RC)
3108     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3109   if (!Arg)
3110     return false;
3111 
3112   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3113   if (!loadInputValue(KernargPtrReg, B, Arg))
3114     return false;
3115 
3116   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3117   MI.eraseFromParent();
3118   return true;
3119 }
3120 
3121 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3122                                               MachineRegisterInfo &MRI,
3123                                               MachineIRBuilder &B,
3124                                               unsigned AddrSpace) const {
3125   B.setInstr(MI);
3126   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3127   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3128   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3129   MI.eraseFromParent();
3130   return true;
3131 }
3132 
3133 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3134 // offset (the offset that is included in bounds checking and swizzling, to be
3135 // split between the instruction's voffset and immoffset fields) and soffset
3136 // (the offset that is excluded from bounds checking and swizzling, to go in
3137 // the instruction's soffset field).  This function takes the first kind of
3138 // offset and figures out how to split it between voffset and immoffset.
3139 std::tuple<Register, unsigned, unsigned>
3140 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3141                                         Register OrigOffset) const {
3142   const unsigned MaxImm = 4095;
3143   Register BaseReg;
3144   unsigned TotalConstOffset;
3145   MachineInstr *OffsetDef;
3146   const LLT S32 = LLT::scalar(32);
3147 
3148   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3149     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3150 
3151   unsigned ImmOffset = TotalConstOffset;
3152 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in the immoffset field and fold the remainder into the
  // voffset, so that the value that is copied/added for the voffset field is
  // a multiple of 4096 and stands a better chance of being CSEd with the
  // copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if the result
  // is a negative number, as it appears to be illegal to have a negative
  // offset in the vgpr, even if adding the immediate offset makes it positive.
3160   unsigned Overflow = ImmOffset & ~MaxImm;
3161   ImmOffset -= Overflow;
3162   if ((int32_t)Overflow < 0) {
3163     Overflow += ImmOffset;
3164     ImmOffset = 0;
3165   }
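  // For example, a TotalConstOffset of 0x11234 splits into Overflow = 0x11000
  // (folded into the voffset below) and ImmOffset = 0x234.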
3166 
3167   if (Overflow != 0) {
3168     if (!BaseReg) {
3169       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3170     } else {
3171       auto OverflowVal = B.buildConstant(S32, Overflow);
3172       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3173     }
3174   }
3175 
3176   if (!BaseReg)
3177     BaseReg = B.buildConstant(S32, 0).getReg(0);
3178 
3179   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3180 }
3181 
/// Handle the register layout difference for f16 images on some subtargets.
3183 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3184                                              MachineRegisterInfo &MRI,
3185                                              Register Reg) const {
3186   if (!ST.hasUnpackedD16VMem())
3187     return Reg;
3188 
3189   const LLT S16 = LLT::scalar(16);
3190   const LLT S32 = LLT::scalar(32);
3191   LLT StoreVT = MRI.getType(Reg);
3192   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3193 
3194   auto Unmerge = B.buildUnmerge(S16, Reg);
3195 
3196   SmallVector<Register, 4> WideRegs;
3197   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3198     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3199 
3200   int NumElts = StoreVT.getNumElements();
3201 
3202   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3203 }
3204 
3205 Register AMDGPULegalizerInfo::fixStoreSourceType(
3206   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3207   MachineRegisterInfo *MRI = B.getMRI();
3208   LLT Ty = MRI->getType(VData);
3209 
3210   const LLT S16 = LLT::scalar(16);
3211 
  // Fix up illegal register types for i8 and i16 stores.
3213   if (Ty == LLT::scalar(8) || Ty == S16) {
3214     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3215     return AnyExt;
3216   }
3217 
3218   if (Ty.isVector()) {
3219     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3220       if (IsFormat)
3221         return handleD16VData(B, *MRI, VData);
3222     }
3223   }
3224 
3225   return VData;
3226 }
3227 
3228 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3229                                               MachineRegisterInfo &MRI,
3230                                               MachineIRBuilder &B,
3231                                               bool IsTyped,
3232                                               bool IsFormat) const {
3233   B.setInstr(MI);
3234 
3235   Register VData = MI.getOperand(1).getReg();
3236   LLT Ty = MRI.getType(VData);
3237   LLT EltTy = Ty.getScalarType();
3238   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3239   const LLT S32 = LLT::scalar(32);
3240 
3241   VData = fixStoreSourceType(B, VData, IsFormat);
3242   Register RSrc = MI.getOperand(2).getReg();
3243 
3244   MachineMemOperand *MMO = *MI.memoperands_begin();
3245   const int MemSize = MMO->getSize();
3246 
3247   unsigned ImmOffset;
3248   unsigned TotalOffset;
3249 
3250   // The typed intrinsics add an immediate after the registers.
3251   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3252 
3253   // The struct intrinsic variants add one additional operand over raw.
3254   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3255   Register VIndex;
3256   int OpOffset = 0;
3257   if (HasVIndex) {
3258     VIndex = MI.getOperand(3).getReg();
3259     OpOffset = 1;
3260   }
3261 
3262   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3263   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3264 
3265   unsigned Format = 0;
3266   if (IsTyped) {
3267     Format = MI.getOperand(5 + OpOffset).getImm();
3268     ++OpOffset;
3269   }
3270 
3271   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3272 
3273   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3274   if (TotalOffset != 0)
3275     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3276 
3277   unsigned Opc;
3278   if (IsTyped) {
3279     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3280                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3281   } else if (IsFormat) {
3282     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3283                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3284   } else {
3285     switch (MemSize) {
3286     case 1:
3287       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3288       break;
3289     case 2:
3290       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3291       break;
3292     default:
3293       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3294       break;
3295     }
3296   }
3297 
3298   if (!VIndex)
3299     VIndex = B.buildConstant(S32, 0).getReg(0);
3300 
3301   auto MIB = B.buildInstr(Opc)
3302     .addUse(VData)              // vdata
3303     .addUse(RSrc)               // rsrc
3304     .addUse(VIndex)             // vindex
3305     .addUse(VOffset)            // voffset
3306     .addUse(SOffset)            // soffset
3307     .addImm(ImmOffset);         // offset(imm)
3308 
3309   if (IsTyped)
3310     MIB.addImm(Format);
3311 
3312   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3313      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3314      .addMemOperand(MMO);
3315 
3316   MI.eraseFromParent();
3317   return true;
3318 }
3319 
3320 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3321                                              MachineRegisterInfo &MRI,
3322                                              MachineIRBuilder &B,
3323                                              bool IsFormat,
3324                                              bool IsTyped) const {
3325   B.setInstr(MI);
3326 
3327   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3328   MachineMemOperand *MMO = *MI.memoperands_begin();
3329   const int MemSize = MMO->getSize();
3330   const LLT S32 = LLT::scalar(32);
3331 
3332   Register Dst = MI.getOperand(0).getReg();
3333   Register RSrc = MI.getOperand(2).getReg();
3334 
3335   // The typed intrinsics add an immediate after the registers.
3336   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3337 
3338   // The struct intrinsic variants add one additional operand over raw.
3339   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3340   Register VIndex;
3341   int OpOffset = 0;
3342   if (HasVIndex) {
3343     VIndex = MI.getOperand(3).getReg();
3344     OpOffset = 1;
3345   }
3346 
3347   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3348   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3349 
3350   unsigned Format = 0;
3351   if (IsTyped) {
3352     Format = MI.getOperand(5 + OpOffset).getImm();
3353     ++OpOffset;
3354   }
3355 
3356   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3357   unsigned ImmOffset;
3358   unsigned TotalOffset;
3359 
3360   LLT Ty = MRI.getType(Dst);
3361   LLT EltTy = Ty.getScalarType();
3362   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3363   const bool Unpacked = ST.hasUnpackedD16VMem();
3364 
3365   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3366   if (TotalOffset != 0)
3367     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3368 
3369   unsigned Opc;
3370 
3371   if (IsTyped) {
3372     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3373                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3374   } else if (IsFormat) {
3375     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3376                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3377   } else {
3378     switch (MemSize) {
3379     case 1:
3380       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3381       break;
3382     case 2:
3383       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3384       break;
3385     default:
3386       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3387       break;
3388     }
3389   }
3390 
3391   Register LoadDstReg;
3392 
3393   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3394   LLT UnpackedTy = Ty.changeElementSize(32);
3395 
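  // Sub-dword loads and scalar d16 loads are widened to a 32-bit result and
  // truncated back after the instruction. d16 vector loads on subtargets with
  // the unpacked layout produce 32-bit-per-element results that are repacked
  // below.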
3396   if (IsExtLoad)
3397     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3398   else if (Unpacked && IsD16 && Ty.isVector())
3399     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3400   else
3401     LoadDstReg = Dst;
3402 
3403   if (!VIndex)
3404     VIndex = B.buildConstant(S32, 0).getReg(0);
3405 
3406   auto MIB = B.buildInstr(Opc)
3407     .addDef(LoadDstReg)         // vdata
3408     .addUse(RSrc)               // rsrc
3409     .addUse(VIndex)             // vindex
3410     .addUse(VOffset)            // voffset
3411     .addUse(SOffset)            // soffset
3412     .addImm(ImmOffset);         // offset(imm)
3413 
3414   if (IsTyped)
3415     MIB.addImm(Format);
3416 
3417   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3418      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3419      .addMemOperand(MMO);
3420 
3421   if (LoadDstReg != Dst) {
3422     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3423 
    // The result for extending loads was widened to 32 bits; truncate back to
    // the original type.
3425     if (IsExtLoad)
3426       B.buildTrunc(Dst, LoadDstReg);
3427     else {
3428       // Repack to original 16-bit vector result
3429       // FIXME: G_TRUNC should work, but legalization currently fails
3430       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3431       SmallVector<Register, 4> Repack;
3432       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3433         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3434       B.buildMerge(Dst, Repack);
3435     }
3436   }
3437 
3438   MI.eraseFromParent();
3439   return true;
3440 }
3441 
3442 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3443                                                MachineIRBuilder &B,
3444                                                bool IsInc) const {
3445   B.setInstr(MI);
3446   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3447                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3448   B.buildInstr(Opc)
3449     .addDef(MI.getOperand(0).getReg())
3450     .addUse(MI.getOperand(2).getReg())
3451     .addUse(MI.getOperand(3).getReg())
3452     .cloneMemRefs(MI);
3453   MI.eraseFromParent();
3454   return true;
3455 }
3456 
3457 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3458   switch (IntrID) {
3459   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3460   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3461     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3462   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3463   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3464     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3465   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3466   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3467     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3468   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3469   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3470     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3471   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3472   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3473     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3474   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3475   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3476     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3477   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3478   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3479     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3480   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3481   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3482     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3483   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3484   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3485     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3486   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3487   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3488     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3489   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3490   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3491     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3492   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3493   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3494     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3495   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3496   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3497     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3498   default:
3499     llvm_unreachable("unhandled atomic opcode");
3500   }
3501 }
3502 
3503 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3504                                                MachineIRBuilder &B,
3505                                                Intrinsic::ID IID) const {
3506   B.setInstr(MI);
3507 
3508   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3509                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3510 
3511   Register Dst = MI.getOperand(0).getReg();
3512   Register VData = MI.getOperand(2).getReg();
3513 
3514   Register CmpVal;
3515   int OpOffset = 0;
3516 
3517   if (IsCmpSwap) {
3518     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3519     ++OpOffset;
3520   }
3521 
3522   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3523   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3524 
3525   // The struct intrinsic variants add one additional operand over raw.
3526   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3527   Register VIndex;
3528   if (HasVIndex) {
3529     VIndex = MI.getOperand(4 + OpOffset).getReg();
3530     ++OpOffset;
3531   }
3532 
3533   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3534   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3535   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3536 
3537   MachineMemOperand *MMO = *MI.memoperands_begin();
3538 
3539   unsigned ImmOffset;
3540   unsigned TotalOffset;
3541   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3542   if (TotalOffset != 0)
3543     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3544 
3545   if (!VIndex)
3546     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3547 
3548   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3549     .addDef(Dst)
3550     .addUse(VData); // vdata
3551 
3552   if (IsCmpSwap)
3553     MIB.addReg(CmpVal);
3554 
3555   MIB.addUse(RSrc)               // rsrc
3556      .addUse(VIndex)             // vindex
3557      .addUse(VOffset)            // voffset
3558      .addUse(SOffset)            // soffset
3559      .addImm(ImmOffset)          // offset(imm)
3560      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3561      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3562      .addMemOperand(MMO);
3563 
3564   MI.eraseFromParent();
3565   return true;
3566 }
3567 
/// Pack the s16 typed address operands of \p MI into dword sized v2s16
/// registers, appending the results to \p PackedAddrs.
3570 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3571                                         SmallVectorImpl<Register> &PackedAddrs,
3572                                         int AddrIdx, int DimIdx, int NumVAddrs,
3573                                         int NumGradients) {
3574   const LLT S16 = LLT::scalar(16);
3575   const LLT V2S16 = LLT::vector(2, 16);
3576 
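  // Operands before DimIdx (offset/bias/zcompare) are just reinterpreted as
  // v2s16; the remaining 16-bit gradient and coordinate components are packed
  // two per dword, padded with undef wherever a component cannot be paired.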
3577   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3578     MachineOperand &SrcOp = MI.getOperand(I);
3579     if (!SrcOp.isReg())
3580       continue; // _L to _LZ may have eliminated this.
3581 
3582     Register AddrReg = SrcOp.getReg();
3583 
3584     if (I < DimIdx) {
3585       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3586       PackedAddrs.push_back(AddrReg);
3587     } else {
3588       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3589       // derivatives dx/dh and dx/dv are packed with undef.
3590       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3591           ((NumGradients / 2) % 2 == 1 &&
3592            (I == DimIdx + (NumGradients / 2) - 1 ||
3593             I == DimIdx + NumGradients - 1)) ||
3594           // Check for _L to _LZ optimization
3595           !MI.getOperand(I + 1).isReg()) {
3596         PackedAddrs.push_back(
3597             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3598                 .getReg(0));
3599       } else {
3600         PackedAddrs.push_back(
3601             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3602                 .getReg(0));
3603         ++I;
3604       }
3605     }
3606   }
3607 }
3608 
3609 /// Convert from separate vaddr components to a single vector address register,
3610 /// and replace the remaining operands with $noreg.
3611 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3612                                      int DimIdx, int NumVAddrs) {
3613   const LLT S32 = LLT::scalar(32);
3614 
3615   SmallVector<Register, 8> AddrRegs;
3616   for (int I = 0; I != NumVAddrs; ++I) {
3617     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3618     if (SrcOp.isReg()) {
3619       AddrRegs.push_back(SrcOp.getReg());
3620       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3621     }
3622   }
3623 
3624   int NumAddrRegs = AddrRegs.size();
3625   if (NumAddrRegs != 1) {
3626     // Round up to 8 elements for v5-v7
3627     // FIXME: Missing intermediate sized register classes and instructions.
3628     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3629       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3630       auto Undef = B.buildUndef(S32);
3631       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3632       NumAddrRegs = RoundedNumRegs;
3633     }
3634 
3635     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3636     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3637   }
3638 
3639   for (int I = 1; I != NumVAddrs; ++I) {
3640     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3641     if (SrcOp.isReg())
3642       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3643   }
3644 }
3645 
3646 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3647 ///
/// Depending on the subtarget, loads and stores with 16-bit element data need
/// to be rewritten to use the low half of 32-bit registers, or to directly use
/// a packed layout. 16-bit addresses should also sometimes be packed into
/// 32-bit registers.
3652 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding the now-unnecessary arguments with $noreg.
3659 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3660     MachineInstr &MI, MachineIRBuilder &B,
3661     GISelChangeObserver &Observer,
3662     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3663   B.setInstr(MI);
3664 
3665   const int NumDefs = MI.getNumExplicitDefs();
3666   bool IsTFE = NumDefs == 2;
3667   // We are only processing the operands of d16 image operations on subtargets
3668   // that use the unpacked register layout, or need to repack the TFE result.
3669 
3670   // TODO: Do we need to guard against already legalized intrinsics?
3671   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3672     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3673 
3674   MachineRegisterInfo *MRI = B.getMRI();
3675   const LLT S32 = LLT::scalar(32);
3676   const LLT S16 = LLT::scalar(16);
3677   const LLT V2S16 = LLT::vector(2, 16);
3678 
3679   // Index of first address argument
3680   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3681 
  // Check for 16-bit addresses; if found, they are packed below.
3683   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3684   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3685   const bool IsA16 = AddrTy == S16;
3686 
3687   int NumVAddrs, NumGradients;
3688   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3689   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3690     getDMaskIdx(BaseOpcode, NumDefs);
3691   unsigned DMask = 0;
3692 
3693   int DMaskLanes = 0;
3694   if (!BaseOpcode->Atomic) {
3695     DMask = MI.getOperand(DMaskIdx).getImm();
3696     if (BaseOpcode->Gather4) {
3697       DMaskLanes = 4;
3698     } else if (DMask != 0) {
3699       DMaskLanes = countPopulation(DMask);
3700     } else if (!IsTFE && !BaseOpcode->Store) {
3701       // If dmask is 0, this is a no-op load. This can be eliminated.
3702       B.buildUndef(MI.getOperand(0));
3703       MI.eraseFromParent();
3704       return true;
3705     }
3706   }
3707 
3708   Observer.changingInstr(MI);
3709   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3710 
3711   unsigned NewOpcode = NumDefs == 0 ?
3712     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3713 
3714   // Track that we legalized this
3715   MI.setDesc(B.getTII().get(NewOpcode));
3716 
  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
3719   if (IsTFE && DMask == 0) {
3720     DMask = 0x1;
3721     DMaskLanes = 1;
3722     MI.getOperand(DMaskIdx).setImm(DMask);
3723   }
3724 
3725   if (BaseOpcode->Atomic) {
3726     Register VData0 = MI.getOperand(2).getReg();
3727     LLT Ty = MRI->getType(VData0);
3728 
3729     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3730     if (Ty.isVector())
3731       return false;
3732 
3733     if (BaseOpcode->AtomicX2) {
3734       Register VData1 = MI.getOperand(3).getReg();
3735       // The two values are packed in one register.
3736       LLT PackedTy = LLT::vector(2, Ty);
3737       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3738       MI.getOperand(2).setReg(Concat.getReg(0));
3739       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3740     }
3741   }
3742 
3743   int CorrectedNumVAddrs = NumVAddrs;
3744 
  // Optimize _L to _LZ when the LOD is zero (or negative).
3746   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3747         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3748     const ConstantFP *ConstantLod;
3749     const int LodIdx = AddrIdx + NumVAddrs - 1;
3750 
3751     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3752       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3753         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3754         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3755           LZMappingInfo->LZ, ImageDimIntr->Dim);
3756 
3757         // The starting indexes should remain in the same place.
3758         --NumVAddrs;
3759         --CorrectedNumVAddrs;
3760 
3761         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3762           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3763         MI.RemoveOperand(LodIdx);
3764       }
3765     }
3766   }
3767 
  // Optimize _mip away when 'lod' is zero.
3769   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3770     int64_t ConstantLod;
3771     const int LodIdx = AddrIdx + NumVAddrs - 1;
3772 
3773     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3774       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3777         MI.getOperand(LodIdx).ChangeToImmediate(0);
3778         --CorrectedNumVAddrs;
3779       }
3780     }
3781   }
3782 
3783   // If the register allocator cannot place the address registers contiguously
3784   // without introducing moves, then using the non-sequential address encoding
3785   // is always preferable, since it saves VALU instructions and is usually a
3786   // wash in terms of code size or even better.
3787   //
3788   // However, we currently have no way of hinting to the register allocator
3789   // that MIMG addresses should be placed contiguously when it is possible to
3790   // do so, so force non-NSA for the common 2-address case as a heuristic.
3791   //
3792   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3793   // allocation when possible.
3794   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3795 
3796   // Rewrite the addressing register layout before doing anything else.
3797   if (IsA16) {
3798     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3799     // should be introduced.
3800     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3801       return false;
3802 
3803     if (NumVAddrs > 1) {
3804       SmallVector<Register, 4> PackedRegs;
3805       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3806                                   NumGradients);
3807 
3808       if (!UseNSA && PackedRegs.size() > 1) {
3809         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3810         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3811         PackedRegs[0] = Concat.getReg(0);
3812         PackedRegs.resize(1);
3813       }
3814 
3815       const int NumPacked = PackedRegs.size();
3816       for (int I = 0; I != NumVAddrs; ++I) {
3817         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3818         if (!SrcOp.isReg()) {
3819           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3820           continue;
3821         }
3822 
3823         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3824 
3825         if (I < NumPacked)
3826           SrcOp.setReg(PackedRegs[I]);
3827         else
3828           SrcOp.setReg(AMDGPU::NoRegister);
3829       }
3830     }
3831   } else if (!UseNSA && NumVAddrs > 1) {
3832     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3833   }
3834 
3835 
3836   if (BaseOpcode->Store) { // No TFE for stores?
3837     // TODO: Handle dmask trim
3838     Register VData = MI.getOperand(1).getReg();
3839     LLT Ty = MRI->getType(VData);
3840     if (!Ty.isVector() || Ty.getElementType() != S16)
3841       return true;
3842 
3843     B.setInstr(MI);
3844 
3845     Register RepackedReg = handleD16VData(B, *MRI, VData);
3846     if (RepackedReg != VData) {
3847       MI.getOperand(1).setReg(RepackedReg);
3848     }
3849 
3850     return true;
3851   }
3852 
3853   Register DstReg = MI.getOperand(0).getReg();
3854   LLT Ty = MRI->getType(DstReg);
3855   const LLT EltTy = Ty.getScalarType();
3856   const bool IsD16 = Ty.getScalarType() == S16;
3857   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3858 
3859   // Confirm that the return type is large enough for the dmask specified
3860   if (NumElts < DMaskLanes)
3861     return false;
3862 
3863   if (NumElts > 4 || DMaskLanes > 4)
3864     return false;
3865 
3866   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3867   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3868 
  // The raw dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3872   LLT RoundedTy;
3873 
  // S32 vector to cover all data, plus the TFE result element.
3875   LLT TFETy;
3876 
3877   // Register type to use for each loaded component. Will be S32 or V2S16.
3878   LLT RegTy;
3879 
3880   if (IsD16 && ST.hasUnpackedD16VMem()) {
3881     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3882     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3883     RegTy = S32;
3884   } else {
3885     unsigned EltSize = EltTy.getSizeInBits();
3886     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3887     unsigned RoundedSize = 32 * RoundedElts;
3888     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3889     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3890     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3891   }
3892 
3893   // The return type does not need adjustment.
3894   // TODO: Should we change s16 case to s32 or <2 x s16>?
3895   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3896     return true;
3897 
3898   Register Dst1Reg;
3899 
3900   // Insert after the instruction.
3901   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3902 
3903   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3904   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3905   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3906   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3907 
3908   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3909 
3910   MI.getOperand(0).setReg(NewResultReg);
3911 
  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
3916 
3917   if (IsTFE) {
3918     Dst1Reg = MI.getOperand(1).getReg();
3919     if (MRI->getType(Dst1Reg) != S32)
3920       return false;
3921 
3922     // TODO: Make sure the TFE operand bit is set.
3923     MI.RemoveOperand(1);
3924 
3925     // Handle the easy case that requires no repack instructions.
3926     if (Ty == S32) {
3927       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3928       return true;
3929     }
3930   }
3931 
3932   // Now figure out how to copy the new result register back into the old
3933   // result.
3934   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3935 
3936   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3937 
3938   if (ResultNumRegs == 1) {
3939     assert(!IsTFE);
3940     ResultRegs[0] = NewResultReg;
3941   } else {
3942     // We have to repack into a new vector of some kind.
3943     for (int I = 0; I != NumDataRegs; ++I)
3944       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3945     B.buildUnmerge(ResultRegs, NewResultReg);
3946 
3947     // Drop the final TFE element to get the data part. The TFE result is
3948     // directly written to the right place already.
3949     if (IsTFE)
3950       ResultRegs.resize(NumDataRegs);
3951   }
3952 
3953   // For an s16 scalar result, we form an s32 result with a truncate regardless
3954   // of packed vs. unpacked.
3955   if (IsD16 && !Ty.isVector()) {
3956     B.buildTrunc(DstReg, ResultRegs[0]);
3957     return true;
3958   }
3959 
3960   // Avoid a build/concat_vector of 1 entry.
3961   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3962     B.buildBitcast(DstReg, ResultRegs[0]);
3963     return true;
3964   }
3965 
3966   assert(Ty.isVector());
3967 
3968   if (IsD16) {
3969     // For packed D16 results with TFE enabled, all the data components are
3970     // S32. Cast back to the expected type.
3971     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
3974     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3975       for (Register &Reg : ResultRegs)
3976         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3977     } else if (ST.hasUnpackedD16VMem()) {
3978       for (Register &Reg : ResultRegs)
3979         Reg = B.buildTrunc(S16, Reg).getReg(0);
3980     }
3981   }
3982 
3983   auto padWithUndef = [&](LLT Ty, int NumElts) {
3984     if (NumElts == 0)
3985       return;
3986     Register Undef = B.buildUndef(Ty).getReg(0);
3987     for (int I = 0; I != NumElts; ++I)
3988       ResultRegs.push_back(Undef);
3989   };
3990 
3991   // Pad out any elements eliminated due to the dmask.
3992   LLT ResTy = MRI->getType(ResultRegs[0]);
3993   if (!ResTy.isVector()) {
3994     padWithUndef(ResTy, NumElts - ResultRegs.size());
3995     B.buildBuildVector(DstReg, ResultRegs);
3996     return true;
3997   }
3998 
3999   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4000   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4001 
4002   // Deal with the one annoying legal case.
4003   const LLT V3S16 = LLT::vector(3, 16);
4004   if (Ty == V3S16) {
4005     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4006     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4007     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4008     return true;
4009   }
4010 
4011   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4012   B.buildConcatVectors(DstReg, ResultRegs);
4013   return true;
4014 }
4015 
4016 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4017   MachineInstr &MI, MachineIRBuilder &B,
4018   GISelChangeObserver &Observer) const {
4019   Register Dst = MI.getOperand(0).getReg();
4020   LLT Ty = B.getMRI()->getType(Dst);
4021   unsigned Size = Ty.getSizeInBits();
4022   MachineFunction &MF = B.getMF();
4023 
4024   Observer.changingInstr(MI);
4025 
4026   // FIXME: We don't really need this intermediate instruction. The intrinsic
4027   // should be fixed to have a memory operand. Since it's readnone, we're not
4028   // allowed to add one.
4029   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4030   MI.RemoveOperand(1); // Remove intrinsic ID
4031 
4032   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4033   // TODO: Should this use datalayout alignment?
4034   const unsigned MemSize = (Size + 7) / 8;
4035   const Align MemAlign(4);
4036   MachineMemOperand *MMO = MF.getMachineMemOperand(
4037       MachinePointerInfo(),
4038       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4039           MachineMemOperand::MOInvariant,
4040       MemSize, MemAlign);
4041   MI.addMemOperand(MF, MMO);
4042 
4043   // There are no 96-bit result scalar loads, but widening to 128-bit should
4044   // always be legal. We may need to restore this to a 96-bit result if it turns
4045   // out this needs to be converted to a vector load during RegBankSelect.
4046   if (!isPowerOf2_32(Size)) {
4047     LegalizerHelper Helper(MF, *this, Observer, B);
4048     B.setInstr(MI);
4049 
4050     if (Ty.isVector())
4051       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4052     else
4053       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4054   }
4055 
4056   Observer.changedInstr(MI);
4057   return true;
4058 }
4059 
4060 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4061                                                 MachineRegisterInfo &MRI,
4062                                                 MachineIRBuilder &B) const {
4063   B.setInstr(MI);
4064 
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
4066   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4067       !ST.isTrapHandlerEnabled()) {
4068     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4069   } else {
    // Pass the queue pointer to the trap handler as an input and insert the
    // trap instruction.
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4072     const ArgDescriptor *Arg =
4073         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4074     if (!Arg)
4075       return false;
4076     MachineRegisterInfo &MRI = *B.getMRI();
4077     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4078     Register LiveIn = getLiveInRegister(
4079         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4080         /*InsertLiveInCopy=*/false);
4081     if (!loadInputValue(LiveIn, B, Arg))
4082       return false;
4083     B.buildCopy(SGPR01, LiveIn);
4084     B.buildInstr(AMDGPU::S_TRAP)
4085         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4086         .addReg(SGPR01, RegState::Implicit);
4087   }
4088 
4089   MI.eraseFromParent();
4090   return true;
4091 }
4092 
4093 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4094     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4095   B.setInstr(MI);
4096 
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
4099   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4100       !ST.isTrapHandlerEnabled()) {
4101     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4102                                      "debugtrap handler not supported",
4103                                      MI.getDebugLoc(), DS_Warning);
4104     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4105     Ctx.diagnose(NoTrap);
4106   } else {
4107     // Insert debug-trap instruction
4108     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4109   }
4110 
4111   MI.eraseFromParent();
4112   return true;
4113 }
4114 
4115 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4116                                             MachineIRBuilder &B,
4117                                             GISelChangeObserver &Observer) const {
4118   MachineRegisterInfo &MRI = *B.getMRI();
4119 
  // Replace the G_BRCOND that uses the intrinsic result with the
  // exec-manipulating branch pseudos.
4121   auto IntrID = MI.getIntrinsicID();
4122   switch (IntrID) {
4123   case Intrinsic::amdgcn_if:
4124   case Intrinsic::amdgcn_else: {
4125     MachineInstr *Br = nullptr;
4126     MachineBasicBlock *UncondBrTarget = nullptr;
4127     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4128       const SIRegisterInfo *TRI
4129         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4130 
4131       B.setInstr(*BrCond);
4132       Register Def = MI.getOperand(1).getReg();
4133       Register Use = MI.getOperand(3).getReg();
4134 
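      // SI_IF/SI_ELSE branch to the block the trailing unconditional branch
      // targeted, and that branch is redirected to the original conditional
      // successor, so the two branch targets are effectively swapped.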
4135       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4136       if (IntrID == Intrinsic::amdgcn_if) {
4137         B.buildInstr(AMDGPU::SI_IF)
4138           .addDef(Def)
4139           .addUse(Use)
4140           .addMBB(UncondBrTarget);
4141       } else {
4142         B.buildInstr(AMDGPU::SI_ELSE)
4143           .addDef(Def)
4144           .addUse(Use)
4145           .addMBB(UncondBrTarget)
4146           .addImm(0);
4147       }
4148 
4149       if (Br) {
4150         Br->getOperand(0).setMBB(CondBrTarget);
4151       } else {
4152         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4153         // since we're swapping branch targets it needs to be reinserted.
4154         // FIXME: IRTranslator should probably not do this
4155         B.buildBr(*CondBrTarget);
4156       }
4157 
4158       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4159       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4160       MI.eraseFromParent();
4161       BrCond->eraseFromParent();
4162       return true;
4163     }
4164 
4165     return false;
4166   }
4167   case Intrinsic::amdgcn_loop: {
4168     MachineInstr *Br = nullptr;
4169     MachineBasicBlock *UncondBrTarget = nullptr;
4170     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4171       const SIRegisterInfo *TRI
4172         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4173 
4174       B.setInstr(*BrCond);
4175 
4176       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4177       Register Reg = MI.getOperand(2).getReg();
4178       B.buildInstr(AMDGPU::SI_LOOP)
4179         .addUse(Reg)
4180         .addMBB(UncondBrTarget);
4181 
4182       if (Br)
4183         Br->getOperand(0).setMBB(CondBrTarget);
4184       else
4185         B.buildBr(*CondBrTarget);
4186 
4187       MI.eraseFromParent();
4188       BrCond->eraseFromParent();
4189       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4190       return true;
4191     }
4192 
4193     return false;
4194   }
4195   case Intrinsic::amdgcn_kernarg_segment_ptr:
4196     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4197       B.setInstr(MI);
4198       // This only makes sense to call in a kernel, so just lower to null.
4199       B.buildConstant(MI.getOperand(0).getReg(), 0);
4200       MI.eraseFromParent();
4201       return true;
4202     }
4203 
4204     return legalizePreloadedArgIntrin(
4205       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4206   case Intrinsic::amdgcn_implicitarg_ptr:
4207     return legalizeImplicitArgPtr(MI, MRI, B);
4208   case Intrinsic::amdgcn_workitem_id_x:
4209     return legalizePreloadedArgIntrin(MI, MRI, B,
4210                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4211   case Intrinsic::amdgcn_workitem_id_y:
4212     return legalizePreloadedArgIntrin(MI, MRI, B,
4213                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4214   case Intrinsic::amdgcn_workitem_id_z:
4215     return legalizePreloadedArgIntrin(MI, MRI, B,
4216                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4217   case Intrinsic::amdgcn_workgroup_id_x:
4218     return legalizePreloadedArgIntrin(MI, MRI, B,
4219                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4220   case Intrinsic::amdgcn_workgroup_id_y:
4221     return legalizePreloadedArgIntrin(MI, MRI, B,
4222                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4223   case Intrinsic::amdgcn_workgroup_id_z:
4224     return legalizePreloadedArgIntrin(MI, MRI, B,
4225                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4226   case Intrinsic::amdgcn_dispatch_ptr:
4227     return legalizePreloadedArgIntrin(MI, MRI, B,
4228                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4229   case Intrinsic::amdgcn_queue_ptr:
4230     return legalizePreloadedArgIntrin(MI, MRI, B,
4231                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4232   case Intrinsic::amdgcn_implicit_buffer_ptr:
4233     return legalizePreloadedArgIntrin(
4234       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4235   case Intrinsic::amdgcn_dispatch_id:
4236     return legalizePreloadedArgIntrin(MI, MRI, B,
4237                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4238   case Intrinsic::amdgcn_fdiv_fast:
4239     return legalizeFDIVFastIntrin(MI, MRI, B);
4240   case Intrinsic::amdgcn_is_shared:
4241     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4242   case Intrinsic::amdgcn_is_private:
4243     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4244   case Intrinsic::amdgcn_wavefrontsize: {
4245     B.setInstr(MI);
4246     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4247     MI.eraseFromParent();
4248     return true;
4249   }
4250   case Intrinsic::amdgcn_s_buffer_load:
4251     return legalizeSBufferLoad(MI, B, Observer);
4252   case Intrinsic::amdgcn_raw_buffer_store:
4253   case Intrinsic::amdgcn_struct_buffer_store:
4254     return legalizeBufferStore(MI, MRI, B, false, false);
4255   case Intrinsic::amdgcn_raw_buffer_store_format:
4256   case Intrinsic::amdgcn_struct_buffer_store_format:
4257     return legalizeBufferStore(MI, MRI, B, false, true);
4258   case Intrinsic::amdgcn_raw_tbuffer_store:
4259   case Intrinsic::amdgcn_struct_tbuffer_store:
4260     return legalizeBufferStore(MI, MRI, B, true, true);
4261   case Intrinsic::amdgcn_raw_buffer_load:
4262   case Intrinsic::amdgcn_struct_buffer_load:
4263     return legalizeBufferLoad(MI, MRI, B, false, false);
4264   case Intrinsic::amdgcn_raw_buffer_load_format:
4265   case Intrinsic::amdgcn_struct_buffer_load_format:
4266     return legalizeBufferLoad(MI, MRI, B, true, false);
4267   case Intrinsic::amdgcn_raw_tbuffer_load:
4268   case Intrinsic::amdgcn_struct_tbuffer_load:
4269     return legalizeBufferLoad(MI, MRI, B, true, true);
4270   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4271   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4272   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4273   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4274   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4275   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4276   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4277   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4278   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4279   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4280   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4281   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4282   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4283   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4284   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4285   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4286   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4287   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4288   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4289   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4290   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4291   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4292   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4293   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4294   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4295   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4296     return legalizeBufferAtomic(MI, B, IntrID);
4297   case Intrinsic::amdgcn_atomic_inc:
4298     return legalizeAtomicIncDec(MI, B, true);
4299   case Intrinsic::amdgcn_atomic_dec:
4300     return legalizeAtomicIncDec(MI, B, false);
4301   case Intrinsic::trap:
4302     return legalizeTrapIntrinsic(MI, MRI, B);
4303   case Intrinsic::debugtrap:
4304     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4305   default: {
4306     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4307             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4308       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4309     return true;
4310   }
4311   }
4312 
4313   return true;
4314 }
4315