1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
46 // Round the number of vector elements up to the next power of two.
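// For example, a <3 x s32> type is rounded up to <4 x s32>.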
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the scalar size in bits up to the next power of two.
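// For example, an s65 type is rounded up to s128.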
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
70   return [=](const LegalityQuery &Query) {
71     return Query.Types[TypeIdx].getSizeInBits() == Size;
72   };
73 }
74 
75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     return Ty.isVector() &&
79            Ty.getNumElements() % 2 != 0 &&
80            Ty.getElementType().getSizeInBits() < 32 &&
81            Ty.getSizeInBits() % 32 != 0;
82   };
83 }
84 
85 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
86   return [=](const LegalityQuery &Query) {
87     const LLT Ty = Query.Types[TypeIdx];
88     const LLT EltTy = Ty.getScalarType();
89     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
90   };
91 }
92 
93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getElementType();
97     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
98   };
99 }
100 
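// Break a wide vector into roughly 64-bit pieces, e.g. a v4s32 (128 bits) is
// split into v2s32 halves.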
101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     unsigned Size = Ty.getSizeInBits();
106     unsigned Pieces = (Size + 63) / 64;
107     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
108     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
109   };
110 }
111 
112 // Increase the number of vector elements until the total size covers the next
113 // multiple of 32 bits.
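// For example, a v3s8 (24 bits) is widened to v4s8 (32 bits).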
114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
115   return [=](const LegalityQuery &Query) {
116     const LLT Ty = Query.Types[TypeIdx];
117 
118     const LLT EltTy = Ty.getElementType();
119     const int Size = Ty.getSizeInBits();
120     const int EltSize = EltTy.getSizeInBits();
121     const int NextMul32 = (Size + 31) / 32;
122 
123     assert(EltSize < 32);
124 
125     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
126     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
127   };
128 }
129 
130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
131   return [=](const LegalityQuery &Query) {
132     const LLT QueryTy = Query.Types[TypeIdx];
133     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
134   };
135 }
136 
137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
138   return [=](const LegalityQuery &Query) {
139     const LLT QueryTy = Query.Types[TypeIdx];
140     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
141   };
142 }
143 
144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT QueryTy = Query.Types[TypeIdx];
147     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
148   };
149 }
150 
151 // Any vector of 32, 64, 128 or 256-bit elements, any multiple of v2s16, and
152 // any scalar that is a multiple of 32 bits up to 1024 bits.
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     if (Ty.isVector()) {
157       const int EltSize = Ty.getElementType().getSizeInBits();
158       return EltSize == 32 || EltSize == 64 ||
159             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
160              EltSize == 128 || EltSize == 256;
161     }
162 
163     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
164   };
165 }
166 
167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getElementType() == Type;
171   };
172 }
173 
174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     if (!QueryTy.isVector())
178       return false;
179     const LLT EltTy = QueryTy.getElementType();
180     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
181   };
182 }
183 
184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     const LLT Ty = Query.Types[TypeIdx];
187     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
188            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
189   };
190 }
191 
192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
193   return [=](const LegalityQuery &Query) {
194     return Query.Types[TypeIdx0].getSizeInBits() <
195            Query.Types[TypeIdx1].getSizeInBits();
196   };
197 }
198 
199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
200   return [=](const LegalityQuery &Query) {
201     return Query.Types[TypeIdx0].getSizeInBits() >
202            Query.Types[TypeIdx1].getSizeInBits();
203   };
204 }
205 
206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
207                                          const GCNTargetMachine &TM)
208   : ST(ST_) {
209   using namespace TargetOpcode;
210 
211   auto GetAddrSpacePtr = [&TM](unsigned AS) {
212     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
213   };
214 
215   const LLT S1 = LLT::scalar(1);
216   const LLT S16 = LLT::scalar(16);
217   const LLT S32 = LLT::scalar(32);
218   const LLT S64 = LLT::scalar(64);
219   const LLT S128 = LLT::scalar(128);
220   const LLT S256 = LLT::scalar(256);
221   const LLT S512 = LLT::scalar(512);
222   const LLT S1024 = LLT::scalar(1024);
223 
224   const LLT V2S16 = LLT::vector(2, 16);
225   const LLT V4S16 = LLT::vector(4, 16);
226 
227   const LLT V2S32 = LLT::vector(2, 32);
228   const LLT V3S32 = LLT::vector(3, 32);
229   const LLT V4S32 = LLT::vector(4, 32);
230   const LLT V5S32 = LLT::vector(5, 32);
231   const LLT V6S32 = LLT::vector(6, 32);
232   const LLT V7S32 = LLT::vector(7, 32);
233   const LLT V8S32 = LLT::vector(8, 32);
234   const LLT V9S32 = LLT::vector(9, 32);
235   const LLT V10S32 = LLT::vector(10, 32);
236   const LLT V11S32 = LLT::vector(11, 32);
237   const LLT V12S32 = LLT::vector(12, 32);
238   const LLT V13S32 = LLT::vector(13, 32);
239   const LLT V14S32 = LLT::vector(14, 32);
240   const LLT V15S32 = LLT::vector(15, 32);
241   const LLT V16S32 = LLT::vector(16, 32);
242   const LLT V32S32 = LLT::vector(32, 32);
243 
244   const LLT V2S64 = LLT::vector(2, 64);
245   const LLT V3S64 = LLT::vector(3, 64);
246   const LLT V4S64 = LLT::vector(4, 64);
247   const LLT V5S64 = LLT::vector(5, 64);
248   const LLT V6S64 = LLT::vector(6, 64);
249   const LLT V7S64 = LLT::vector(7, 64);
250   const LLT V8S64 = LLT::vector(8, 64);
251   const LLT V16S64 = LLT::vector(16, 64);
252 
253   std::initializer_list<LLT> AllS32Vectors =
254     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
255      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
256   std::initializer_list<LLT> AllS64Vectors =
257     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
258 
259   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
260   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
261   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
262   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
263   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
264   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
265   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
266 
267   const LLT CodePtr = FlatPtr;
268 
269   const std::initializer_list<LLT> AddrSpaces64 = {
270     GlobalPtr, ConstantPtr, FlatPtr
271   };
272 
273   const std::initializer_list<LLT> AddrSpaces32 = {
274     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
275   };
276 
277   const std::initializer_list<LLT> FPTypesBase = {
278     S32, S64
279   };
280 
281   const std::initializer_list<LLT> FPTypes16 = {
282     S32, S64, S16
283   };
284 
285   const std::initializer_list<LLT> FPTypesPK16 = {
286     S32, S64, S16, V2S16
287   };
288 
289   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
290 
291   setAction({G_BRCOND, S1}, Legal); // VCC branches
292   setAction({G_BRCOND, S32}, Legal); // SCC branches
293 
294   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
295   // elements for v3s16
296   getActionDefinitionsBuilder(G_PHI)
297     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
298     .legalFor(AllS32Vectors)
299     .legalFor(AllS64Vectors)
300     .legalFor(AddrSpaces64)
301     .legalFor(AddrSpaces32)
302     .clampScalar(0, S32, S256)
303     .widenScalarToNextPow2(0, 32)
304     .clampMaxNumElements(0, S32, 16)
305     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
306     .legalIf(isPointer(0));
307 
308   if (ST.hasVOP3PInsts()) {
309     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
310       .legalFor({S32, S16, V2S16})
311       .clampScalar(0, S16, S32)
312       .clampMaxNumElements(0, S16, 2)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else if (ST.has16BitInsts()) {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32, S16})
318       .clampScalar(0, S16, S32)
319       .scalarize(0)
320       .widenScalarToNextPow2(0, 32);
321   } else {
322     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
323       .legalFor({S32})
324       .clampScalar(0, S32, S32)
325       .scalarize(0);
326   }
327 
328   // FIXME: Not really legal. Placeholder for custom lowering.
329   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
330     .customFor({S32, S64})
331     .clampScalar(0, S32, S64)
332     .widenScalarToNextPow2(0, 32)
333     .scalarize(0);
334 
335   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
336     .legalFor({S32})
337     .clampScalar(0, S32, S32)
338     .scalarize(0);
339 
340   // Report legal for any types we can handle anywhere. For the cases only legal
341   // on the SALU, RegBankSelect will be able to re-legalize.
342   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
343     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
344     .clampScalar(0, S32, S64)
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
347     .widenScalarToNextPow2(0)
348     .scalarize(0);
349 
350   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
351                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
352     .legalFor({{S32, S1}, {S32, S32}})
353     .minScalar(0, S32)
354     // TODO: .scalarize(0)
355     .lower();
356 
357   getActionDefinitionsBuilder(G_BITCAST)
358     // Don't worry about the size constraint.
359     .legalIf(all(isRegisterType(0), isRegisterType(1)))
360     .lower();
361 
362 
363   getActionDefinitionsBuilder(G_CONSTANT)
364     .legalFor({S1, S32, S64, S16, GlobalPtr,
365                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
366     .clampScalar(0, S32, S64)
367     .widenScalarToNextPow2(0)
368     .legalIf(isPointer(0));
369 
370   getActionDefinitionsBuilder(G_FCONSTANT)
371     .legalFor({S32, S64, S16})
372     .clampScalar(0, S16, S64);
373 
374   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
375     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
376                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
377     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378     .clampScalarOrElt(0, S32, S1024)
379     .legalIf(isMultiple32(0))
380     .widenScalarToNextPow2(0, 32)
381     .clampMaxNumElements(0, S32, 16);
382 
383   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
384   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
385     .unsupportedFor({PrivatePtr})
386     .custom();
387   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
388 
389   auto &FPOpActions = getActionDefinitionsBuilder(
390     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
391     .legalFor({S32, S64});
392   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
393     .customFor({S32, S64});
394   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
395     .customFor({S32, S64});
396 
397   if (ST.has16BitInsts()) {
398     if (ST.hasVOP3PInsts())
399       FPOpActions.legalFor({S16, V2S16});
400     else
401       FPOpActions.legalFor({S16});
402 
403     TrigActions.customFor({S16});
404     FDIVActions.customFor({S16});
405   }
406 
407   auto &MinNumMaxNum = getActionDefinitionsBuilder({
408       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
409 
410   if (ST.hasVOP3PInsts()) {
411     MinNumMaxNum.customFor(FPTypesPK16)
412       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
413       .clampMaxNumElements(0, S16, 2)
414       .clampScalar(0, S16, S64)
415       .scalarize(0);
416   } else if (ST.has16BitInsts()) {
417     MinNumMaxNum.customFor(FPTypes16)
418       .clampScalar(0, S16, S64)
419       .scalarize(0);
420   } else {
421     MinNumMaxNum.customFor(FPTypesBase)
422       .clampScalar(0, S32, S64)
423       .scalarize(0);
424   }
425 
426   if (ST.hasVOP3PInsts())
427     FPOpActions.clampMaxNumElements(0, S16, 2);
428 
429   FPOpActions
430     .scalarize(0)
431     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
432 
433   TrigActions
434     .scalarize(0)
435     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
436 
437   FDIVActions
438     .scalarize(0)
439     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
440 
441   getActionDefinitionsBuilder({G_FNEG, G_FABS})
442     .legalFor(FPTypesPK16)
443     .clampMaxNumElements(0, S16, 2)
444     .scalarize(0)
445     .clampScalar(0, S16, S64);
446 
447   if (ST.has16BitInsts()) {
448     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
449       .legalFor({S32, S64, S16})
450       .scalarize(0)
451       .clampScalar(0, S16, S64);
452   } else {
453     getActionDefinitionsBuilder(G_FSQRT)
454       .legalFor({S32, S64})
455       .scalarize(0)
456       .clampScalar(0, S32, S64);
457 
458     if (ST.hasFractBug()) {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .customFor({S64})
461         .legalFor({S32, S64})
462         .scalarize(0)
463         .clampScalar(0, S32, S64);
464     } else {
465       getActionDefinitionsBuilder(G_FFLOOR)
466         .legalFor({S32, S64})
467         .scalarize(0)
468         .clampScalar(0, S32, S64);
469     }
470   }
471 
472   getActionDefinitionsBuilder(G_FPTRUNC)
473     .legalFor({{S32, S64}, {S16, S32}})
474     .scalarize(0)
475     .lower();
476 
477   getActionDefinitionsBuilder(G_FPEXT)
478     .legalFor({{S64, S32}, {S32, S16}})
479     .lowerFor({{S64, S16}}) // FIXME: Implement
480     .scalarize(0);
481 
482   getActionDefinitionsBuilder(G_FSUB)
483       // Use actual fsub instruction
484       .legalFor({S32})
485       // Must use fadd + fneg
486       .lowerFor({S64, S16, V2S16})
487       .scalarize(0)
488       .clampScalar(0, S32, S64);
489 
490   // Whether this is legal depends on the floating point mode for the function.
491   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
492   if (ST.hasMadF16())
493     FMad.customFor({S32, S16});
494   else
495     FMad.customFor({S32});
496   FMad.scalarize(0)
497       .lower();
498 
499   // TODO: Do we need to clamp maximum bitwidth?
500   getActionDefinitionsBuilder(G_TRUNC)
501     .legalIf(isScalar(0))
502     .legalFor({{V2S16, V2S32}})
503     .clampMaxNumElements(0, S16, 2)
504     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
505     // situations (like an invalid implicit use), we don't want to infinite loop
506     // in the legalizer.
507     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
508     .alwaysLegal();
509 
510   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
511     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
512                {S32, S1}, {S64, S1}, {S16, S1}})
513     .scalarize(0)
514     .clampScalar(0, S32, S64)
515     .widenScalarToNextPow2(1, 32);
516 
517   // TODO: Split s1->s64 during regbankselect for VALU.
518   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
519     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
520     .lowerFor({{S32, S64}})
521     .lowerIf(typeIs(1, S1))
522     .customFor({{S64, S64}});
523   if (ST.has16BitInsts())
524     IToFP.legalFor({{S16, S16}});
525   IToFP.clampScalar(1, S32, S64)
526        .scalarize(0)
527        .widenScalarToNextPow2(1);
528 
529   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
530     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
531     .customFor({{S64, S64}});
532   if (ST.has16BitInsts())
533     FPToI.legalFor({{S16, S16}});
534   else
535     FPToI.minScalar(1, S32);
536 
537   FPToI.minScalar(0, S32)
538        .scalarize(0)
539        .lower();
540 
541   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
542     .scalarize(0)
543     .lower();
544 
545   if (ST.has16BitInsts()) {
546     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
547       .legalFor({S16, S32, S64})
548       .clampScalar(0, S16, S64)
549       .scalarize(0);
550   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
551     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
552       .legalFor({S32, S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   } else {
556     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
557       .legalFor({S32})
558       .customFor({S64})
559       .clampScalar(0, S32, S64)
560       .scalarize(0);
561   }
562 
563   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
564     .scalarize(0)
565     .alwaysLegal();
566 
567   auto &CmpBuilder =
568     getActionDefinitionsBuilder(G_ICMP)
569     // The compare output type differs based on the register bank of the output,
570     // so make both s1 and s32 legal.
571     //
572     // Scalar compares producing output in scc will be promoted to s32, as that
573     // is the allocatable register type that will be needed for the copy from
574     // scc. This will be promoted during RegBankSelect, and we assume something
575     // before that won't try to use s32 result types.
576     //
577     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
578     // bank.
579     .legalForCartesianProduct(
580       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
581     .legalForCartesianProduct(
582       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
583   if (ST.has16BitInsts()) {
584     CmpBuilder.legalFor({{S1, S16}});
585   }
586 
587   CmpBuilder
588     .widenScalarToNextPow2(1)
589     .clampScalar(1, S32, S64)
590     .scalarize(0)
591     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
592 
593   getActionDefinitionsBuilder(G_FCMP)
594     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
595     .widenScalarToNextPow2(1)
596     .clampScalar(1, S32, S64)
597     .scalarize(0);
598 
599   // FIXME: fpow has a selection pattern that should move to custom lowering.
600   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
601   if (ST.has16BitInsts())
602     Exp2Ops.legalFor({S32, S16});
603   else
604     Exp2Ops.legalFor({S32});
605   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
606   Exp2Ops.scalarize(0);
607 
608   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
609   if (ST.has16BitInsts())
610     ExpOps.customFor({{S32}, {S16}});
611   else
612     ExpOps.customFor({S32});
613   ExpOps.clampScalar(0, MinScalarFPTy, S32)
614         .scalarize(0);
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder(G_CTPOP)
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   // The hardware instructions return a different result on 0 than the generic
626   // instructions expect. The hardware produces -1, but these produce the
627   // bitwidth.
628   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
629     .scalarize(0)
630     .clampScalar(0, S32, S32)
631     .clampScalar(1, S32, S64)
632     .widenScalarToNextPow2(0, 32)
633     .widenScalarToNextPow2(1, 32)
634     .lower();
635 
636   // The 64-bit versions produce 32-bit results, but only on the SALU.
637   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
638     .legalFor({{S32, S32}, {S32, S64}})
639     .clampScalar(0, S32, S32)
640     .clampScalar(1, S32, S64)
641     .scalarize(0)
642     .widenScalarToNextPow2(0, 32)
643     .widenScalarToNextPow2(1, 32);
644 
645   getActionDefinitionsBuilder(G_BITREVERSE)
646     .legalFor({S32})
647     .clampScalar(0, S32, S32)
648     .scalarize(0);
649 
650   if (ST.has16BitInsts()) {
651     getActionDefinitionsBuilder(G_BSWAP)
652       .legalFor({S16, S32, V2S16})
653       .clampMaxNumElements(0, S16, 2)
654       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
655       // narrowScalar limitation.
656       .widenScalarToNextPow2(0)
657       .clampScalar(0, S16, S32)
658       .scalarize(0);
659 
660     if (ST.hasVOP3PInsts()) {
661       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
662         .legalFor({S32, S16, V2S16})
663         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
664         .clampMaxNumElements(0, S16, 2)
665         .minScalar(0, S16)
666         .widenScalarToNextPow2(0)
667         .scalarize(0)
668         .lower();
669     } else {
670       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
671         .legalFor({S32, S16})
672         .widenScalarToNextPow2(0)
673         .minScalar(0, S16)
674         .scalarize(0)
675         .lower();
676     }
677   } else {
678     // TODO: Should have same legality without v_perm_b32
679     getActionDefinitionsBuilder(G_BSWAP)
680       .legalFor({S32})
681       .lowerIf(narrowerThan(0, 32))
682       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
683       // narrowScalar limitation.
684       .widenScalarToNextPow2(0)
685       .maxScalar(0, S32)
686       .scalarize(0)
687       .lower();
688 
689     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
690       .legalFor({S32})
691       .minScalar(0, S32)
692       .widenScalarToNextPow2(0)
693       .scalarize(0)
694       .lower();
695   }
696 
697   getActionDefinitionsBuilder(G_INTTOPTR)
698     // List the common cases
699     .legalForCartesianProduct(AddrSpaces64, {S64})
700     .legalForCartesianProduct(AddrSpaces32, {S32})
701     .scalarize(0)
702     // Accept any address space as long as the size matches
703     .legalIf(sameSize(0, 1))
704     .widenScalarIf(smallerThan(1, 0),
705       [](const LegalityQuery &Query) {
706         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
707       })
708     .narrowScalarIf(greaterThan(1, 0),
709       [](const LegalityQuery &Query) {
710         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
711       });
712 
713   getActionDefinitionsBuilder(G_PTRTOINT)
714     // List the common cases
715     .legalForCartesianProduct(AddrSpaces64, {S64})
716     .legalForCartesianProduct(AddrSpaces32, {S32})
717     .scalarize(0)
718     // Accept any address space as long as the size matches
719     .legalIf(sameSize(0, 1))
720     .widenScalarIf(smallerThan(0, 1),
721       [](const LegalityQuery &Query) {
722         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
723       })
724     .narrowScalarIf(
725       greaterThan(0, 1),
726       [](const LegalityQuery &Query) {
727         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
728       });
729 
730   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
731     .scalarize(0)
732     .custom();
733 
734   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
735   // handle some operations by just promoting the register during
736   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
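  // Maximum memory access size, in bits, that we allow as a single legal
  // access for the given address space.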
737   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
738     switch (AS) {
739     // FIXME: Private element size.
740     case AMDGPUAS::PRIVATE_ADDRESS:
741       return 32;
742     // FIXME: Check subtarget
743     case AMDGPUAS::LOCAL_ADDRESS:
744       return ST.useDS128() ? 128 : 64;
745 
746     // Treat constant and global as identical. SMRD loads are sometimes usable
747     // for global loads (ideally constant address space should be eliminated)
748     // depending on the context. Legality cannot be context dependent, but
749     // RegBankSelect can split the load as necessary depending on the pointer
750     // register bank/uniformity and if the memory is invariant or not written in
751     // a kernel.
752     case AMDGPUAS::CONSTANT_ADDRESS:
753     case AMDGPUAS::GLOBAL_ADDRESS:
754       return IsLoad ? 512 : 128;
755     default:
756       return 128;
757     }
758   };
759 
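  // Return true if a load/store of this type must be broken into multiple
  // pieces: a vector extload, an access wider than the address space allows,
  // an unsupported register count, or insufficient alignment.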
760   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
761                                     bool IsLoad) -> bool {
762     const LLT DstTy = Query.Types[0];
763 
764     // Split vector extloads.
765     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
766     unsigned Align = Query.MMODescrs[0].AlignInBits;
767 
768     if (MemSize < DstTy.getSizeInBits())
769       MemSize = std::max(MemSize, Align);
770 
771     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
772       return true;
773 
774     const LLT PtrTy = Query.Types[1];
775     unsigned AS = PtrTy.getAddressSpace();
776     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
777       return true;
778 
779     // Catch weird sized loads that don't evenly divide into the access sizes
780     // TODO: May be able to widen depending on alignment etc.
781     unsigned NumRegs = (MemSize + 31) / 32;
782     if (NumRegs == 3) {
783       if (!ST.hasDwordx3LoadStores())
784         return true;
785     } else {
786       // If the alignment allows, these should have been widened.
787       if (!isPowerOf2_32(NumRegs))
788         return true;
789     }
790 
791     if (Align < MemSize) {
792       const SITargetLowering *TLI = ST.getTargetLowering();
793       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
794     }
795 
796     return false;
797   };
798 
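  // Widen an odd-sized load result to the next power of 2 when the alignment
  // already covers the widened size, so the wider access cannot stray past the
  // naturally aligned block.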
799   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
800     unsigned Size = Query.Types[0].getSizeInBits();
801     if (isPowerOf2_32(Size))
802       return false;
803 
804     if (Size == 96 && ST.hasDwordx3LoadStores())
805       return false;
806 
807     unsigned AddrSpace = Query.Types[1].getAddressSpace();
808     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
809       return false;
810 
811     unsigned Align = Query.MMODescrs[0].AlignInBits;
812     unsigned RoundedSize = NextPowerOf2(Size);
813     return (Align >= RoundedSize);
814   };
815 
816   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
817   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
818   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
819 
820   // TODO: Refine based on subtargets which support unaligned access or 128-bit
821   // LDS
822   // TODO: Unsupported flat for SI.
823 
824   for (unsigned Op : {G_LOAD, G_STORE}) {
825     const bool IsStore = Op == G_STORE;
826 
827     auto &Actions = getActionDefinitionsBuilder(Op);
828     // Whitelist the common cases.
829     // TODO: Loads to s16 on gfx9
830     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
831                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
832                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
833                                       {S128, GlobalPtr, 128, GlobalAlign32},
834                                       {S64, GlobalPtr, 64, GlobalAlign32},
835                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
836                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
837                                       {S32, GlobalPtr, 8, GlobalAlign8},
838                                       {S32, GlobalPtr, 16, GlobalAlign16},
839 
840                                       {S32, LocalPtr, 32, 32},
841                                       {S64, LocalPtr, 64, 32},
842                                       {V2S32, LocalPtr, 64, 32},
843                                       {S32, LocalPtr, 8, 8},
844                                       {S32, LocalPtr, 16, 16},
845                                       {V2S16, LocalPtr, 32, 32},
846 
847                                       {S32, PrivatePtr, 32, 32},
848                                       {S32, PrivatePtr, 8, 8},
849                                       {S32, PrivatePtr, 16, 16},
850                                       {V2S16, PrivatePtr, 32, 32},
851 
852                                       {S32, FlatPtr, 32, GlobalAlign32},
853                                       {S32, FlatPtr, 16, GlobalAlign16},
854                                       {S32, FlatPtr, 8, GlobalAlign8},
855                                       {V2S16, FlatPtr, 32, GlobalAlign32},
856 
857                                       {S32, ConstantPtr, 32, GlobalAlign32},
858                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
859                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
860                                       {S64, ConstantPtr, 64, GlobalAlign32},
861                                       {S128, ConstantPtr, 128, GlobalAlign32},
862                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
863     Actions
864         .customIf(typeIs(1, Constant32Ptr))
865         // Widen suitably aligned loads by loading extra elements.
866         .moreElementsIf([=](const LegalityQuery &Query) {
867             const LLT Ty = Query.Types[0];
868             return Op == G_LOAD && Ty.isVector() &&
869                    shouldWidenLoadResult(Query);
870           }, moreElementsToNextPow2(0))
871         .widenScalarIf([=](const LegalityQuery &Query) {
872             const LLT Ty = Query.Types[0];
873             return Op == G_LOAD && !Ty.isVector() &&
874                    shouldWidenLoadResult(Query);
875           }, widenScalarOrEltToNextPow2(0))
876         .narrowScalarIf(
877             [=](const LegalityQuery &Query) -> bool {
878               return !Query.Types[0].isVector() &&
879                      needToSplitMemOp(Query, Op == G_LOAD);
880             },
881             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
882               const LLT DstTy = Query.Types[0];
883               const LLT PtrTy = Query.Types[1];
884 
885               const unsigned DstSize = DstTy.getSizeInBits();
886               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
887 
888               // Split extloads.
889               if (DstSize > MemSize)
890                 return std::make_pair(0, LLT::scalar(MemSize));
891 
892               if (!isPowerOf2_32(DstSize)) {
893                 // We're probably decomposing an odd sized store. Try to split
894                 // to the widest type. TODO: Account for alignment. As-is it
895                 // should be OK, since the new parts will be further legalized.
896                 unsigned FloorSize = PowerOf2Floor(DstSize);
897                 return std::make_pair(0, LLT::scalar(FloorSize));
898               }
899 
900               if (DstSize > 32 && (DstSize % 32 != 0)) {
901                 // FIXME: Need a way to specify non-extload of larger size if
902                 // suitably aligned.
903                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
904               }
905 
906               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
907                                                      Op == G_LOAD);
908               if (MemSize > MaxSize)
909                 return std::make_pair(0, LLT::scalar(MaxSize));
910 
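              // Only the alignment check remains as the reason to split;
              // break the access down to the aligned size.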
911               unsigned Align = Query.MMODescrs[0].AlignInBits;
912               return std::make_pair(0, LLT::scalar(Align));
913             })
914         .fewerElementsIf(
915             [=](const LegalityQuery &Query) -> bool {
916               return Query.Types[0].isVector() &&
917                      needToSplitMemOp(Query, Op == G_LOAD);
918             },
919             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
920               const LLT DstTy = Query.Types[0];
921               const LLT PtrTy = Query.Types[1];
922 
923               LLT EltTy = DstTy.getElementType();
924               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
925                                                      Op == G_LOAD);
926 
927               // FIXME: Handle widened to power of 2 results better. This ends
928               // up scalarizing.
929               // FIXME: 3 element stores scalarized on SI
930 
931               // Split if it's too large for the address space.
932               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
933                 unsigned NumElts = DstTy.getNumElements();
934                 unsigned EltSize = EltTy.getSizeInBits();
935 
936                 if (MaxSize % EltSize == 0) {
937                   return std::make_pair(
938                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
939                 }
940 
941                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
942 
943                 // FIXME: Refine when odd breakdowns handled
944                 // The scalars will need to be re-legalized.
945                 if (NumPieces == 1 || NumPieces >= NumElts ||
946                     NumElts % NumPieces != 0)
947                   return std::make_pair(0, EltTy);
948 
949                 return std::make_pair(0,
950                                       LLT::vector(NumElts / NumPieces, EltTy));
951               }
952 
953               // FIXME: We could probably handle weird extending loads better.
954               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
955               if (DstTy.getSizeInBits() > MemSize)
956                 return std::make_pair(0, EltTy);
957 
958               unsigned EltSize = EltTy.getSizeInBits();
959               unsigned DstSize = DstTy.getSizeInBits();
960               if (!isPowerOf2_32(DstSize)) {
961                 // We're probably decomposing an odd sized store. Try to split
962                 // to the widest type. TODO: Account for alignment. As-is it
963                 // should be OK, since the new parts will be further legalized.
964                 unsigned FloorSize = PowerOf2Floor(DstSize);
965                 return std::make_pair(
966                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
967               }
968 
969               // Need to split because of alignment.
970               unsigned Align = Query.MMODescrs[0].AlignInBits;
971               if (EltSize > Align &&
972                   (EltSize / Align < DstTy.getNumElements())) {
973                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
974               }
975 
976               // May need relegalization for the scalars.
977               return std::make_pair(0, EltTy);
978             })
979         .minScalar(0, S32);
980 
981     if (IsStore)
982       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
983 
984     // TODO: Need a bitcast lower option?
985     Actions
986         .legalIf([=](const LegalityQuery &Query) {
987           const LLT Ty0 = Query.Types[0];
988           unsigned Size = Ty0.getSizeInBits();
989           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
990           unsigned Align = Query.MMODescrs[0].AlignInBits;
991 
992           // FIXME: Widening store from alignment not valid.
993           if (MemSize < Size)
994             MemSize = std::max(MemSize, Align);
995 
996           // No extending vector loads.
997           if (Size > MemSize && Ty0.isVector())
998             return false;
999 
1000           switch (MemSize) {
1001           case 8:
1002           case 16:
1003             return Size == 32;
1004           case 32:
1005           case 64:
1006           case 128:
1007             return true;
1008           case 96:
1009             return ST.hasDwordx3LoadStores();
1010           case 256:
1011           case 512:
1012             return true;
1013           default:
1014             return false;
1015           }
1016         })
1017         .widenScalarToNextPow2(0)
1018         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1019   }
1020 
1021   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1022                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1023                                                   {S32, GlobalPtr, 16, 2 * 8},
1024                                                   {S32, LocalPtr, 8, 8},
1025                                                   {S32, LocalPtr, 16, 16},
1026                                                   {S32, PrivatePtr, 8, 8},
1027                                                   {S32, PrivatePtr, 16, 16},
1028                                                   {S32, ConstantPtr, 8, 8},
1029                                                   {S32, ConstantPtr, 16, 2 * 8}});
1030   if (ST.hasFlatAddressSpace()) {
1031     ExtLoads.legalForTypesWithMemDesc(
1032         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1033   }
1034 
1035   ExtLoads.clampScalar(0, S32, S32)
1036           .widenScalarToNextPow2(0)
1037           .unsupportedIfMemSizeNotPow2()
1038           .lower();
1039 
1040   auto &Atomics = getActionDefinitionsBuilder(
1041     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1042      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1043      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1044      G_ATOMICRMW_UMIN})
1045     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1046                {S64, GlobalPtr}, {S64, LocalPtr}});
1047   if (ST.hasFlatAddressSpace()) {
1048     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1049   }
1050 
1051   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1052     .legalFor({{S32, LocalPtr}});
1053 
1054   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1055   // demarshalling.
1056   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1057     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1058                 {S32, FlatPtr}, {S64, FlatPtr}})
1059     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1060                {S32, RegionPtr}, {S64, RegionPtr}});
1061   // TODO: Pointer types, any 32-bit or 64-bit vector
1062 
1063   // Condition should be s32 for scalar, s1 for vector.
1064   getActionDefinitionsBuilder(G_SELECT)
1065     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1066           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1067           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1068     .clampScalar(0, S16, S64)
1069     .scalarize(1)
1070     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1071     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1072     .clampMaxNumElements(0, S32, 2)
1073     .clampMaxNumElements(0, LocalPtr, 2)
1074     .clampMaxNumElements(0, PrivatePtr, 2)
1075     .scalarize(0)
1076     .widenScalarToNextPow2(0)
1077     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1078 
1079   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1080   // be more flexible with the shift amount type.
1081   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1082     .legalFor({{S32, S32}, {S64, S32}});
1083   if (ST.has16BitInsts()) {
1084     if (ST.hasVOP3PInsts()) {
1085       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1086             .clampMaxNumElements(0, S16, 2);
1087     } else
1088       Shifts.legalFor({{S16, S16}});
1089 
1090     // TODO: Support 16-bit shift amounts for all types
1091     Shifts.widenScalarIf(
1092       [=](const LegalityQuery &Query) {
1093         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1094         // 32-bit amount.
1095         const LLT ValTy = Query.Types[0];
1096         const LLT AmountTy = Query.Types[1];
1097         return ValTy.getSizeInBits() <= 16 &&
1098                AmountTy.getSizeInBits() < 16;
1099       }, changeTo(1, S16));
1100     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1101     Shifts.clampScalar(1, S32, S32);
1102     Shifts.clampScalar(0, S16, S64);
1103     Shifts.widenScalarToNextPow2(0, 16);
1104   } else {
1105     // Make sure we legalize the shift amount type first, as the general
1106     // expansion for the shifted type will produce much worse code if it hasn't
1107     // been truncated already.
1108     Shifts.clampScalar(1, S32, S32);
1109     Shifts.clampScalar(0, S32, S64);
1110     Shifts.widenScalarToNextPow2(0, 32);
1111   }
1112   Shifts.scalarize(0);
1113 
1114   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1115     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1116     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1117     unsigned IdxTypeIdx = 2;
1118 
1119     getActionDefinitionsBuilder(Op)
1120       .customIf([=](const LegalityQuery &Query) {
1121           const LLT EltTy = Query.Types[EltTypeIdx];
1122           const LLT VecTy = Query.Types[VecTypeIdx];
1123           const LLT IdxTy = Query.Types[IdxTypeIdx];
1124           return (EltTy.getSizeInBits() == 16 ||
1125                   EltTy.getSizeInBits() % 32 == 0) &&
1126                  VecTy.getSizeInBits() % 32 == 0 &&
1127                  VecTy.getSizeInBits() <= 1024 &&
1128                  IdxTy.getSizeInBits() == 32;
1129         })
1130       .clampScalar(EltTypeIdx, S32, S64)
1131       .clampScalar(VecTypeIdx, S32, S64)
1132       .clampScalar(IdxTypeIdx, S32, S32);
1133   }
1134 
1135   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1136     .unsupportedIf([=](const LegalityQuery &Query) {
1137         const LLT &EltTy = Query.Types[1].getElementType();
1138         return Query.Types[0] != EltTy;
1139       });
1140 
1141   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1142     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1143     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1144 
1145     // FIXME: Doesn't handle extract of illegal sizes.
1146     getActionDefinitionsBuilder(Op)
1147       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1148       // FIXME: Multiples of 16 should not be legal.
1149       .legalIf([=](const LegalityQuery &Query) {
1150           const LLT BigTy = Query.Types[BigTyIdx];
1151           const LLT LitTy = Query.Types[LitTyIdx];
1152           return (BigTy.getSizeInBits() % 32 == 0) &&
1153                  (LitTy.getSizeInBits() % 16 == 0);
1154         })
1155       .widenScalarIf(
1156         [=](const LegalityQuery &Query) {
1157           const LLT BigTy = Query.Types[BigTyIdx];
1158           return (BigTy.getScalarSizeInBits() < 16);
1159         },
1160         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1161       .widenScalarIf(
1162         [=](const LegalityQuery &Query) {
1163           const LLT LitTy = Query.Types[LitTyIdx];
1164           return (LitTy.getScalarSizeInBits() < 16);
1165         },
1166         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1167       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1168       .widenScalarToNextPow2(BigTyIdx, 32);
1169 
1170   }
1171 
1172   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1173     .legalForCartesianProduct(AllS32Vectors, {S32})
1174     .legalForCartesianProduct(AllS64Vectors, {S64})
1175     .clampNumElements(0, V16S32, V32S32)
1176     .clampNumElements(0, V2S64, V16S64)
1177     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1178 
1179   if (ST.hasScalarPackInsts()) {
1180     BuildVector
1181       // FIXME: Should probably widen s1 vectors straight to s32
1182       .minScalarOrElt(0, S16)
1183       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1184       .minScalar(1, S32);
1185 
1186     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1187       .legalFor({V2S16, S32})
1188       .lower();
1189     BuildVector.minScalarOrElt(0, S32);
1190   } else {
1191     BuildVector.customFor({V2S16, S16});
1192     BuildVector.minScalarOrElt(0, S32);
1193 
1194     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1195       .customFor({V2S16, S32})
1196       .lower();
1197   }
1198 
1199   BuildVector.legalIf(isRegisterType(0));
1200 
1201   // FIXME: Clamp maximum size
1202   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1203     .legalIf(isRegisterType(0));
1204 
1205   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1206   // pre-legalize.
1207   if (ST.hasVOP3PInsts()) {
1208     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1209       .customFor({V2S16, V2S16})
1210       .lower();
1211   } else
1212     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1213 
1214   // Merge/Unmerge
1215   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1216     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1217     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1218 
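    // Element types narrower than 8 bits, wider than 512 bits, or with a
    // non-power-of-2 size get their vectors broken down to scalars below.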
1219     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1220       const LLT Ty = Query.Types[TypeIdx];
1221       if (Ty.isVector()) {
1222         const LLT &EltTy = Ty.getElementType();
1223         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1224           return true;
1225         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1226           return true;
1227       }
1228       return false;
1229     };
1230 
1231     auto &Builder = getActionDefinitionsBuilder(Op)
1232       // Try to widen to s16 first for small types.
1233       // TODO: Only do this on targets with legal s16 shifts
1234       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1235 
1236       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1237       .lowerFor({{S16, V2S16}})
1238       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1239       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1240                            elementTypeIs(1, S16)),
1241                        changeTo(1, V2S16))
1242       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1243       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1244       // valid.
1245       .clampScalar(LitTyIdx, S32, S512)
1246       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1247       // Break up vectors with weird elements into scalars
1248       .fewerElementsIf(
1249         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1250         scalarize(0))
1251       .fewerElementsIf(
1252         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1253         scalarize(1))
1254       .clampScalar(BigTyIdx, S32, S1024);
1255 
1256     if (Op == G_MERGE_VALUES) {
1257       Builder.widenScalarIf(
1258         // TODO: Use 16-bit shifts if legal for 8-bit values?
1259         [=](const LegalityQuery &Query) {
1260           const LLT Ty = Query.Types[LitTyIdx];
1261           return Ty.getSizeInBits() < 32;
1262         },
1263         changeTo(LitTyIdx, S32));
1264     }
1265 
1266     Builder.widenScalarIf(
1267       [=](const LegalityQuery &Query) {
1268         const LLT Ty = Query.Types[BigTyIdx];
1269         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1270           Ty.getSizeInBits() % 16 != 0;
1271       },
1272       [=](const LegalityQuery &Query) {
1273         // Pick the next power of 2, or a multiple of 64 over 128, whichever is
1274         // smaller.
1275         const LLT &Ty = Query.Types[BigTyIdx];
1276         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1277         if (NewSizeInBits >= 256) {
1278           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1279           if (RoundedTo < NewSizeInBits)
1280             NewSizeInBits = RoundedTo;
1281         }
1282         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1283       })
1284       .legalIf([=](const LegalityQuery &Query) {
1285           const LLT &BigTy = Query.Types[BigTyIdx];
1286           const LLT &LitTy = Query.Types[LitTyIdx];
1287 
1288           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1289             return false;
1290           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1291             return false;
1292 
1293           return BigTy.getSizeInBits() % 16 == 0 &&
1294                  LitTy.getSizeInBits() % 16 == 0 &&
1295                  BigTy.getSizeInBits() <= 1024;
1296         })
1297       // Any vectors left are the wrong size. Scalarize them.
1298       .scalarize(0)
1299       .scalarize(1);
1300   }
1301 
1302   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1303   // RegBankSelect.
1304   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1305     .legalFor({{S32}, {S64}});
1306 
1307   if (ST.hasVOP3PInsts()) {
1308     SextInReg.lowerFor({{V2S16}})
1309       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1310       // get more vector shift opportunities, since we'll get those when
1311       // expanded.
1312       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1313   } else if (ST.has16BitInsts()) {
1314     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1315   } else {
1316     // Prefer to promote to s32 before lowering if we don't have 16-bit
1317     // shifts. This avoids a lot of intermediate truncate and extend operations.
1318     SextInReg.lowerFor({{S32}, {S64}});
1319   }
1320 
1321   SextInReg
1322     .scalarize(0)
1323     .clampScalar(0, S32, S64)
1324     .lower();
1325 
1326   getActionDefinitionsBuilder(G_FSHR)
1327     .legalFor({{S32, S32}})
1328     .scalarize(0)
1329     .lower();
1330 
1331   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1332     .legalFor({S64});
1333 
1334   getActionDefinitionsBuilder({
1335       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1336       G_FCOPYSIGN,
1337 
1338       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1339       G_READ_REGISTER,
1340       G_WRITE_REGISTER,
1341 
1342       G_SADDO, G_SSUBO,
1343 
1344        // TODO: Implement
1345       G_FMINIMUM, G_FMAXIMUM,
1346       G_FSHL
1347     }).lower();
1348 
1349   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1350         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1351         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1352     .unsupported();
1353 
1354   computeTables();
1355   verify(*ST.getInstrInfo());
1356 }
1357 
1358 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1359                                          MachineRegisterInfo &MRI,
1360                                          MachineIRBuilder &B,
1361                                          GISelChangeObserver &Observer) const {
1362   switch (MI.getOpcode()) {
1363   case TargetOpcode::G_ADDRSPACE_CAST:
1364     return legalizeAddrSpaceCast(MI, MRI, B);
1365   case TargetOpcode::G_FRINT:
1366     return legalizeFrint(MI, MRI, B);
1367   case TargetOpcode::G_FCEIL:
1368     return legalizeFceil(MI, MRI, B);
1369   case TargetOpcode::G_INTRINSIC_TRUNC:
1370     return legalizeIntrinsicTrunc(MI, MRI, B);
1371   case TargetOpcode::G_SITOFP:
1372     return legalizeITOFP(MI, MRI, B, true);
1373   case TargetOpcode::G_UITOFP:
1374     return legalizeITOFP(MI, MRI, B, false);
1375   case TargetOpcode::G_FPTOSI:
1376     return legalizeFPTOI(MI, MRI, B, true);
1377   case TargetOpcode::G_FPTOUI:
1378     return legalizeFPTOI(MI, MRI, B, false);
1379   case TargetOpcode::G_FMINNUM:
1380   case TargetOpcode::G_FMAXNUM:
1381   case TargetOpcode::G_FMINNUM_IEEE:
1382   case TargetOpcode::G_FMAXNUM_IEEE:
1383     return legalizeMinNumMaxNum(MI, MRI, B);
1384   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1385     return legalizeExtractVectorElt(MI, MRI, B);
1386   case TargetOpcode::G_INSERT_VECTOR_ELT:
1387     return legalizeInsertVectorElt(MI, MRI, B);
1388   case TargetOpcode::G_SHUFFLE_VECTOR:
1389     return legalizeShuffleVector(MI, MRI, B);
1390   case TargetOpcode::G_FSIN:
1391   case TargetOpcode::G_FCOS:
1392     return legalizeSinCos(MI, MRI, B);
1393   case TargetOpcode::G_GLOBAL_VALUE:
1394     return legalizeGlobalValue(MI, MRI, B);
1395   case TargetOpcode::G_LOAD:
1396     return legalizeLoad(MI, MRI, B, Observer);
1397   case TargetOpcode::G_FMAD:
1398     return legalizeFMad(MI, MRI, B);
1399   case TargetOpcode::G_FDIV:
1400     return legalizeFDIV(MI, MRI, B);
1401   case TargetOpcode::G_UDIV:
1402   case TargetOpcode::G_UREM:
1403     return legalizeUDIV_UREM(MI, MRI, B);
1404   case TargetOpcode::G_SDIV:
1405   case TargetOpcode::G_SREM:
1406     return legalizeSDIV_SREM(MI, MRI, B);
1407   case TargetOpcode::G_ATOMIC_CMPXCHG:
1408     return legalizeAtomicCmpXChg(MI, MRI, B);
1409   case TargetOpcode::G_FLOG:
1410     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1411   case TargetOpcode::G_FLOG10:
1412     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1413   case TargetOpcode::G_FEXP:
1414     return legalizeFExp(MI, B);
1415   case TargetOpcode::G_FPOW:
1416     return legalizeFPow(MI, B);
1417   case TargetOpcode::G_FFLOOR:
1418     return legalizeFFloor(MI, MRI, B);
1419   case TargetOpcode::G_BUILD_VECTOR:
1420     return legalizeBuildVector(MI, MRI, B);
1421   default:
1422     return false;
1423   }
1424 
1425   llvm_unreachable("expected switch to return");
1426 }
1427 
1428 Register AMDGPULegalizerInfo::getSegmentAperture(
1429   unsigned AS,
1430   MachineRegisterInfo &MRI,
1431   MachineIRBuilder &B) const {
1432   MachineFunction &MF = B.getMF();
1433   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1434   const LLT S32 = LLT::scalar(32);
1435 
1436   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1437 
1438   if (ST.hasApertureRegs()) {
1439     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1440     // getreg.
1441     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1442         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1443         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1444     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1445         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1446         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1447     unsigned Encoding =
1448         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1449         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1450         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1451 
1452     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1453 
1454     B.buildInstr(AMDGPU::S_GETREG_B32)
1455       .addDef(GetReg)
1456       .addImm(Encoding);
1457     MRI.setType(GetReg, S32);
1458 
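    // The hardware register holds a 16-bit aperture value in its low bits.
    // Shift it into the upper half so it can be used directly as the high
    // 32 bits of the 64-bit segment aperture base address.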
1459     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1460     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1461   }
1462 
1463   Register QueuePtr = MRI.createGenericVirtualRegister(
1464     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1465 
1466   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1467   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1468     return Register();
1469 
1470   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1471   // private_segment_aperture_base_hi.
1472   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1473 
1474   // TODO: can we be smarter about machine pointer info?
1475   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1476   MachineMemOperand *MMO = MF.getMachineMemOperand(
1477       PtrInfo,
1478       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1479           MachineMemOperand::MOInvariant,
1480       4, commonAlignment(Align(64), StructOffset));
1481 
1482   Register LoadAddr;
1483 
1484   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1485   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1486 }
1487 
1488 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1489   MachineInstr &MI, MachineRegisterInfo &MRI,
1490   MachineIRBuilder &B) const {
1491   MachineFunction &MF = B.getMF();
1492 
1493   B.setInstr(MI);
1494 
1495   const LLT S32 = LLT::scalar(32);
1496   Register Dst = MI.getOperand(0).getReg();
1497   Register Src = MI.getOperand(1).getReg();
1498 
1499   LLT DstTy = MRI.getType(Dst);
1500   LLT SrcTy = MRI.getType(Src);
1501   unsigned DestAS = DstTy.getAddressSpace();
1502   unsigned SrcAS = SrcTy.getAddressSpace();
1503 
1504   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1505   // vector element.
1506   assert(!DstTy.isVector());
1507 
1508   const AMDGPUTargetMachine &TM
1509     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1510 
1511   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1512   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1513     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1514     return true;
1515   }
1516 
1517   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1518     // Truncate.
1519     B.buildExtract(Dst, Src, 0);
1520     MI.eraseFromParent();
1521     return true;
1522   }
1523 
1524   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1525     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1526     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1527 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into a
    // pointer of a different address space. Merge operands are required to be
    // the same type, but creating an extra ptrtoint would be kind of pointless.
1531     auto HighAddr = B.buildConstant(
1532       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1533     B.buildMerge(Dst, {Src, HighAddr});
1534     MI.eraseFromParent();
1535     return true;
1536   }
1537 
1538   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1539     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1540            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1541     unsigned NullVal = TM.getNullPointerValue(DestAS);
1542 
1543     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1544     auto FlatNull = B.buildConstant(SrcTy, 0);
1545 
1546     // Extract low 32-bits of the pointer.
1547     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1548 
1549     auto CmpRes =
1550         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1551     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1552 
1553     MI.eraseFromParent();
1554     return true;
1555   }
1556 
1557   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1558     return false;
1559 
1560   if (!ST.hasFlatAddressSpace())
1561     return false;
1562 
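  // Casting a local or private pointer to a flat pointer: combine the 32-bit
  // segment offset with the aperture base of the source address space, unless
  // the source is the segment's null pointer, in which case the flat null
  // value is produced instead.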
1563   auto SegmentNull =
1564       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1565   auto FlatNull =
1566       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1567 
1568   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1569   if (!ApertureReg.isValid())
1570     return false;
1571 
1572   auto CmpRes =
1573       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1574 
1575   // Coerce the type of the low half of the result so we can use merge_values.
1576   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1577 
1578   // TODO: Should we allow mismatched types but matching sizes in merges to
1579   // avoid the ptrtoint?
1580   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1581   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1582 
1583   MI.eraseFromParent();
1584   return true;
1585 }
1586 
1587 bool AMDGPULegalizerInfo::legalizeFrint(
1588   MachineInstr &MI, MachineRegisterInfo &MRI,
1589   MachineIRBuilder &B) const {
1590   B.setInstr(MI);
1591 
1592   Register Src = MI.getOperand(1).getReg();
1593   LLT Ty = MRI.getType(Src);
1594   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1595 
1596   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1597   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
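  // C1 is 2^52: adding and then subtracting it (with the sign of the source
  // applied) rounds the value to an integer, since a double has 52 fraction
  // bits. C2 is the largest double below 2^52; any |Src| greater than C2 is
  // already an integer, so the original value is selected in that case.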
1598 
1599   auto C1 = B.buildFConstant(Ty, C1Val);
1600   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1601 
1602   // TODO: Should this propagate fast-math-flags?
1603   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1604   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1605 
1606   auto C2 = B.buildFConstant(Ty, C2Val);
1607   auto Fabs = B.buildFAbs(Ty, Src);
1608 
1609   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1610   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1612 }
1613 
1614 bool AMDGPULegalizerInfo::legalizeFceil(
1615   MachineInstr &MI, MachineRegisterInfo &MRI,
1616   MachineIRBuilder &B) const {
1617   B.setInstr(MI);
1618 
1619   const LLT S1 = LLT::scalar(1);
1620   const LLT S64 = LLT::scalar(64);
1621 
1622   Register Src = MI.getOperand(1).getReg();
1623   assert(MRI.getType(Src) == S64);
1624 
1625   // result = trunc(src)
1626   // if (src > 0.0 && src != result)
1627   //   result += 1.0
1628 
1629   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1630 
1631   const auto Zero = B.buildFConstant(S64, 0.0);
1632   const auto One = B.buildFConstant(S64, 1.0);
1633   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1634   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1635   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1636   auto Add = B.buildSelect(S64, And, One, Zero);
1637 
1638   // TODO: Should this propagate fast-math-flags?
1639   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1641 }
1642 
1643 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1644                                               MachineIRBuilder &B) {
1645   const unsigned FractBits = 52;
1646   const unsigned ExpBits = 11;
1647   LLT S32 = LLT::scalar(32);
1648 
1649   auto Const0 = B.buildConstant(S32, FractBits - 32);
1650   auto Const1 = B.buildConstant(S32, ExpBits);
1651 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1655 
1656   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1657 }
1658 
1659 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1660   MachineInstr &MI, MachineRegisterInfo &MRI,
1661   MachineIRBuilder &B) const {
1662   B.setInstr(MI);
1663 
1664   const LLT S1 = LLT::scalar(1);
1665   const LLT S32 = LLT::scalar(32);
1666   const LLT S64 = LLT::scalar(64);
1667 
1668   Register Src = MI.getOperand(1).getReg();
1669   assert(MRI.getType(Src) == S64);
1670 
1671   // TODO: Should this use extract since the low half is unused?
1672   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1673   Register Hi = Unmerge.getReg(1);
1674 
1675   // Extract the upper half, since this is where we will find the sign and
1676   // exponent.
1677   auto Exp = extractF64Exponent(Hi, B);
1678 
1679   const unsigned FractBits = 52;
1680 
1681   // Extract the sign bit.
1682   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1683   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1684 
1685   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1686 
1687   const auto Zero32 = B.buildConstant(S32, 0);
1688 
1689   // Extend back to 64-bits.
1690   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1691 
1692   auto Shr = B.buildAShr(S64, FractMask, Exp);
1693   auto Not = B.buildNot(S64, Shr);
1694   auto Tmp0 = B.buildAnd(S64, Src, Not);
1695   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1696 
1697   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1698   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1699 
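  // If the exponent is negative, |Src| < 1.0 and the result is a signed zero
  // (only the sign bit survives). If the exponent is greater than 51, there
  // are no fractional bits and Src is returned unchanged. Otherwise use Tmp0,
  // which has the fractional bits below the binary point masked off.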
1700   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1701   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1703 }
1704 
1705 bool AMDGPULegalizerInfo::legalizeITOFP(
1706   MachineInstr &MI, MachineRegisterInfo &MRI,
1707   MachineIRBuilder &B, bool Signed) const {
1708   B.setInstr(MI);
1709 
1710   Register Dst = MI.getOperand(0).getReg();
1711   Register Src = MI.getOperand(1).getReg();
1712 
1713   const LLT S64 = LLT::scalar(64);
1714   const LLT S32 = LLT::scalar(32);
1715 
1716   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1717 
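  // Convert the two 32-bit halves separately, scale the converted high half
  // by 2^32 with ldexp, and add the unsigned conversion of the low half.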
1718   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1719 
1720   auto CvtHi = Signed ?
1721     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1722     B.buildUITOFP(S64, Unmerge.getReg(1));
1723 
1724   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1725 
1726   auto ThirtyTwo = B.buildConstant(S32, 32);
1727   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1728     .addUse(CvtHi.getReg(0))
1729     .addUse(ThirtyTwo.getReg(0));
1730 
1731   // TODO: Should this propagate fast-math-flags?
1732   B.buildFAdd(Dst, LdExp, CvtLo);
1733   MI.eraseFromParent();
1734   return true;
1735 }
1736 
1737 // TODO: Copied from DAG implementation. Verify logic and document how this
1738 // actually works.
1739 bool AMDGPULegalizerInfo::legalizeFPTOI(
1740   MachineInstr &MI, MachineRegisterInfo &MRI,
1741   MachineIRBuilder &B, bool Signed) const {
1742   B.setInstr(MI);
1743 
1744   Register Dst = MI.getOperand(0).getReg();
1745   Register Src = MI.getOperand(1).getReg();
1746 
1747   const LLT S64 = LLT::scalar(64);
1748   const LLT S32 = LLT::scalar(32);
1749 
1750   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1751 
1752   unsigned Flags = MI.getFlags();
1753 
1754   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1755   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1756   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
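  // K0 = 2^-32 and K1 = -2^32. FloorMul is the high 32 bits of the result as
  // a double, and the fma recovers the remaining low 32 bits as
  // trunc(Src) - FloorMul * 2^32.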
1757 
1758   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1759   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1760   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1761 
1762   auto Hi = Signed ?
1763     B.buildFPTOSI(S32, FloorMul) :
1764     B.buildFPTOUI(S32, FloorMul);
1765   auto Lo = B.buildFPTOUI(S32, Fma);
1766 
1767   B.buildMerge(Dst, { Lo, Hi });
1768   MI.eraseFromParent();
1769 
1770   return true;
1771 }
1772 
1773 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1774   MachineInstr &MI, MachineRegisterInfo &MRI,
1775   MachineIRBuilder &B) const {
1776   MachineFunction &MF = B.getMF();
1777   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1778 
1779   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1780                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1781 
1782   // With ieee_mode disabled, the instructions have the correct behavior
1783   // already for G_FMINNUM/G_FMAXNUM
1784   if (!MFI->getMode().IEEE)
1785     return !IsIEEEOp;
1786 
1787   if (IsIEEEOp)
1788     return true;
1789 
1790   MachineIRBuilder HelperBuilder(MI);
1791   GISelObserverWrapper DummyObserver;
1792   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1793   HelperBuilder.setInstr(MI);
1794   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1795 }
1796 
1797 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1798   MachineInstr &MI, MachineRegisterInfo &MRI,
1799   MachineIRBuilder &B) const {
1800   // TODO: Should move some of this into LegalizerHelper.
1801 
1802   // TODO: Promote dynamic indexing of s16 to s32
1803 
1804   // FIXME: Artifact combiner probably should have replaced the truncated
1805   // constant before this, so we shouldn't need
1806   // getConstantVRegValWithLookThrough.
1807   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1808     MI.getOperand(2).getReg(), MRI);
1809   if (!IdxVal) // Dynamic case will be selected to register indexing.
1810     return true;
1811 
1812   Register Dst = MI.getOperand(0).getReg();
1813   Register Vec = MI.getOperand(1).getReg();
1814 
1815   LLT VecTy = MRI.getType(Vec);
1816   LLT EltTy = VecTy.getElementType();
1817   assert(EltTy == MRI.getType(Dst));
1818 
1819   B.setInstr(MI);
1820 
1821   if (IdxVal->Value < VecTy.getNumElements())
1822     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1823   else
1824     B.buildUndef(Dst);
1825 
1826   MI.eraseFromParent();
1827   return true;
1828 }
1829 
1830 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1831   MachineInstr &MI, MachineRegisterInfo &MRI,
1832   MachineIRBuilder &B) const {
1833   // TODO: Should move some of this into LegalizerHelper.
1834 
1835   // TODO: Promote dynamic indexing of s16 to s32
1836 
1837   // FIXME: Artifact combiner probably should have replaced the truncated
1838   // constant before this, so we shouldn't need
1839   // getConstantVRegValWithLookThrough.
1840   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1841     MI.getOperand(3).getReg(), MRI);
1842   if (!IdxVal) // Dynamic case will be selected to register indexing.
1843     return true;
1844 
1845   Register Dst = MI.getOperand(0).getReg();
1846   Register Vec = MI.getOperand(1).getReg();
1847   Register Ins = MI.getOperand(2).getReg();
1848 
1849   LLT VecTy = MRI.getType(Vec);
1850   LLT EltTy = VecTy.getElementType();
1851   assert(EltTy == MRI.getType(Ins));
1852 
1853   B.setInstr(MI);
1854 
1855   if (IdxVal->Value < VecTy.getNumElements())
1856     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1857   else
1858     B.buildUndef(Dst);
1859 
1860   MI.eraseFromParent();
1861   return true;
1862 }
1863 
1864 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1865   MachineInstr &MI, MachineRegisterInfo &MRI,
1866   MachineIRBuilder &B) const {
1867   const LLT V2S16 = LLT::vector(2, 16);
1868 
1869   Register Dst = MI.getOperand(0).getReg();
1870   Register Src0 = MI.getOperand(1).getReg();
1871   LLT DstTy = MRI.getType(Dst);
1872   LLT SrcTy = MRI.getType(Src0);
1873 
1874   if (SrcTy == V2S16 && DstTy == V2S16 &&
1875       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1876     return true;
1877 
1878   MachineIRBuilder HelperBuilder(MI);
1879   GISelObserverWrapper DummyObserver;
1880   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1881   HelperBuilder.setInstr(MI);
1882   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1883 }
1884 
1885 bool AMDGPULegalizerInfo::legalizeSinCos(
1886   MachineInstr &MI, MachineRegisterInfo &MRI,
1887   MachineIRBuilder &B) const {
1888   B.setInstr(MI);
1889 
1890   Register DstReg = MI.getOperand(0).getReg();
1891   Register SrcReg = MI.getOperand(1).getReg();
1892   LLT Ty = MRI.getType(DstReg);
1893   unsigned Flags = MI.getFlags();
1894 
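  // The hardware sin/cos inputs are measured in full rotations rather than
  // radians, so scale the source by 1/(2*pi) first. On subtargets where the
  // instructions only accept a reduced input range, also take the fractional
  // part of the scaled value.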
1895   Register TrigVal;
1896   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1897   if (ST.hasTrigReducedRange()) {
1898     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1899     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1900       .addUse(MulVal.getReg(0))
1901       .setMIFlags(Flags).getReg(0);
1902   } else
1903     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1904 
1905   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1906     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1907   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1908     .addUse(TrigVal)
1909     .setMIFlags(Flags);
1910   MI.eraseFromParent();
1911   return true;
1912 }
1913 
1914 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1915   Register DstReg, LLT PtrTy,
1916   MachineIRBuilder &B, const GlobalValue *GV,
1917   unsigned Offset, unsigned GAFlags) const {
1918   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1919   // to the following code sequence:
1920   //
1921   // For constant address space:
1922   //   s_getpc_b64 s[0:1]
1923   //   s_add_u32 s0, s0, $symbol
1924   //   s_addc_u32 s1, s1, 0
1925   //
1926   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1927   //   a fixup or relocation is emitted to replace $symbol with a literal
1928   //   constant, which is a pc-relative offset from the encoding of the $symbol
1929   //   operand to the global variable.
1930   //
1931   // For global address space:
1932   //   s_getpc_b64 s[0:1]
1933   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1934   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1935   //
1936   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1937   //   fixups or relocations are emitted to replace $symbol@*@lo and
1938   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1939   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1940   //   operand to the global variable.
1941   //
1942   // What we want here is an offset from the value returned by s_getpc
1943   // (which is the address of the s_add_u32 instruction) to the global
1944   // variable, but since the encoding of $symbol starts 4 bytes after the start
1945   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1946   // small. This requires us to add 4 to the global variable offset in order to
1947   // compute the correct address.
1948 
1949   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1950 
1951   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1952     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1953 
1954   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1955     .addDef(PCReg);
1956 
1957   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1958   if (GAFlags == SIInstrInfo::MO_NONE)
1959     MIB.addImm(0);
1960   else
1961     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1962 
1963   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1964 
1965   if (PtrTy.getSizeInBits() == 32)
1966     B.buildExtract(DstReg, PCReg, 0);
1967   return true;
}
1969 
1970 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1971   MachineInstr &MI, MachineRegisterInfo &MRI,
1972   MachineIRBuilder &B) const {
1973   Register DstReg = MI.getOperand(0).getReg();
1974   LLT Ty = MRI.getType(DstReg);
1975   unsigned AS = Ty.getAddressSpace();
1976 
1977   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1978   MachineFunction &MF = B.getMF();
1979   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1980   B.setInstr(MI);
1981 
1982   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1983     if (!MFI->isEntryFunction()) {
1984       const Function &Fn = MF.getFunction();
1985       DiagnosticInfoUnsupported BadLDSDecl(
1986         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1987         DS_Warning);
1988       Fn.getContext().diagnose(BadLDSDecl);
1989 
1990       // We currently don't have a way to correctly allocate LDS objects that
1991       // aren't directly associated with a kernel. We do force inlining of
1992       // functions that use local objects. However, if these dead functions are
1993       // not eliminated, we don't want a compile time error. Just emit a warning
1994       // and a trap, since there should be no callable path here.
1995       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1996       B.buildUndef(DstReg);
1997       MI.eraseFromParent();
1998       return true;
1999     }
2000 
2001     // TODO: We could emit code to handle the initialization somewhere.
2002     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2003       const SITargetLowering *TLI = ST.getTargetLowering();
2004       if (!TLI->shouldUseLDSConstAddress(GV)) {
2005         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2006         return true; // Leave in place;
2007       }
2008 
2009       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
2010       MI.eraseFromParent();
2011       return true;
2012     }
2013 
2014     const Function &Fn = MF.getFunction();
2015     DiagnosticInfoUnsupported BadInit(
2016       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2017     Fn.getContext().diagnose(BadInit);
2018     return true;
2019   }
2020 
2021   const SITargetLowering *TLI = ST.getTargetLowering();
2022 
2023   if (TLI->shouldEmitFixup(GV)) {
2024     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2025     MI.eraseFromParent();
2026     return true;
2027   }
2028 
2029   if (TLI->shouldEmitPCReloc(GV)) {
2030     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2031     MI.eraseFromParent();
2032     return true;
2033   }
2034 
2035   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2036   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2037 
2038   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2039       MachinePointerInfo::getGOT(MF),
2040       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2041           MachineMemOperand::MOInvariant,
2042       8 /*Size*/, Align(8));
2043 
2044   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2045 
2046   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2048     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2049     B.buildExtract(DstReg, Load, 0);
2050   } else
2051     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2052 
2053   MI.eraseFromParent();
2054   return true;
2055 }
2056 
2057 bool AMDGPULegalizerInfo::legalizeLoad(
2058   MachineInstr &MI, MachineRegisterInfo &MRI,
2059   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2060   B.setInstr(MI);
2061   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2062   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2063   Observer.changingInstr(MI);
2064   MI.getOperand(1).setReg(Cast.getReg(0));
2065   Observer.changedInstr(MI);
2066   return true;
2067 }
2068 
2069 bool AMDGPULegalizerInfo::legalizeFMad(
2070   MachineInstr &MI, MachineRegisterInfo &MRI,
2071   MachineIRBuilder &B) const {
2072   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2073   assert(Ty.isScalar());
2074 
2075   MachineFunction &MF = B.getMF();
2076   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2077 
2078   // TODO: Always legal with future ftz flag.
2079   // FIXME: Do we need just output?
2080   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2081     return true;
2082   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2083     return true;
2084 
2085   MachineIRBuilder HelperBuilder(MI);
2086   GISelObserverWrapper DummyObserver;
2087   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2088   HelperBuilder.setInstr(MI);
2089   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2090 }
2091 
2092 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2093   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2094   Register DstReg = MI.getOperand(0).getReg();
2095   Register PtrReg = MI.getOperand(1).getReg();
2096   Register CmpVal = MI.getOperand(2).getReg();
2097   Register NewVal = MI.getOperand(3).getReg();
2098 
2099   assert(SITargetLowering::isFlatGlobalAddrSpace(
2100            MRI.getType(PtrReg).getAddressSpace()) &&
2101          "this should not have been custom lowered");
2102 
2103   LLT ValTy = MRI.getType(CmpVal);
2104   LLT VecTy = LLT::vector(2, ValTy);
2105 
2106   B.setInstr(MI);
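  // The target instruction expects the new value and the compare value packed
  // together into a single two-element vector operand.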
2107   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2108 
2109   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2110     .addDef(DstReg)
2111     .addUse(PtrReg)
2112     .addUse(PackedVal)
2113     .setMemRefs(MI.memoperands());
2114 
2115   MI.eraseFromParent();
2116   return true;
2117 }
2118 
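// Expand G_FLOG/G_FLOG10 as log_b(x) = log2(x) * (1 / log2(b)). The caller
// passes 1 / log2(b) as Log2BaseInverted.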
2119 bool AMDGPULegalizerInfo::legalizeFlog(
2120   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2121   Register Dst = MI.getOperand(0).getReg();
2122   Register Src = MI.getOperand(1).getReg();
2123   LLT Ty = B.getMRI()->getType(Dst);
2124   unsigned Flags = MI.getFlags();
2125   B.setInstr(MI);
2126 
2127   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2128   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2129 
2130   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2131   MI.eraseFromParent();
2132   return true;
2133 }
2134 
2135 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2136                                        MachineIRBuilder &B) const {
2137   Register Dst = MI.getOperand(0).getReg();
2138   Register Src = MI.getOperand(1).getReg();
2139   unsigned Flags = MI.getFlags();
2140   LLT Ty = B.getMRI()->getType(Dst);
2141   B.setInstr(MI);
2142 
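  // exp(x) == exp2(x * log2(e)).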
2143   auto K = B.buildFConstant(Ty, numbers::log2e);
2144   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2145   B.buildFExp2(Dst, Mul, Flags);
2146   MI.eraseFromParent();
2147   return true;
2148 }
2149 
2150 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2151                                        MachineIRBuilder &B) const {
2152   Register Dst = MI.getOperand(0).getReg();
2153   Register Src0 = MI.getOperand(1).getReg();
2154   Register Src1 = MI.getOperand(2).getReg();
2155   unsigned Flags = MI.getFlags();
2156   LLT Ty = B.getMRI()->getType(Dst);
2157   B.setInstr(MI);
2158   const LLT S16 = LLT::scalar(16);
2159   const LLT S32 = LLT::scalar(32);
2160 
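  // pow(x, y) is expanded as exp2(log2(x) * y); the multiply is done with the
  // legacy multiply intrinsic, for which 0 * anything is 0.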
2161   if (Ty == S32) {
2162     auto Log = B.buildFLog2(S32, Src0, Flags);
2163     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2164       .addUse(Log.getReg(0))
2165       .addUse(Src1)
2166       .setMIFlags(Flags);
2167     B.buildFExp2(Dst, Mul, Flags);
2168   } else if (Ty == S16) {
2169     // There's no f16 fmul_legacy, so we need to convert for it.
2170     auto Log = B.buildFLog2(S16, Src0, Flags);
2171     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2172     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2173     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2174       .addUse(Ext0.getReg(0))
2175       .addUse(Ext1.getReg(0))
2176       .setMIFlags(Flags);
2177 
2178     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2179   } else
2180     return false;
2181 
2182   MI.eraseFromParent();
2183   return true;
2184 }
2185 
2186 // Find a source register, ignoring any possible source modifiers.
2187 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2188   Register ModSrc = OrigSrc;
2189   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2190     ModSrc = SrcFNeg->getOperand(1).getReg();
2191     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2192       ModSrc = SrcFAbs->getOperand(1).getReg();
2193   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2194     ModSrc = SrcFAbs->getOperand(1).getReg();
2195   return ModSrc;
2196 }
2197 
2198 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2199                                          MachineRegisterInfo &MRI,
2200                                          MachineIRBuilder &B) const {
2201   B.setInstr(MI);
2202 
2203   const LLT S1 = LLT::scalar(1);
2204   const LLT S64 = LLT::scalar(64);
2205   Register Dst = MI.getOperand(0).getReg();
2206   Register OrigSrc = MI.getOperand(1).getReg();
2207   unsigned Flags = MI.getFlags();
2208   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2209          "this should not have been custom lowered");
2210 
2211   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2212   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2213   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2214   // V_FRACT bug is:
2215   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2216   //
2217   // Convert floor(x) to (x - fract(x))
2218 
2219   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2220     .addUse(OrigSrc)
2221     .setMIFlags(Flags);
2222 
2223   // Give source modifier matching some assistance before obscuring a foldable
2224   // pattern.
2225 
2226   // TODO: We can avoid the neg on the fract? The input sign to fract
2227   // shouldn't matter?
2228   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2229 
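  // 0x3fefffffffffffff is the largest double strictly less than 1.0, used to
  // clamp the result of the buggy V_FRACT as described above.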
2230   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2231 
2232   Register Min = MRI.createGenericVirtualRegister(S64);
2233 
2234   // We don't need to concern ourselves with the snan handling difference, so
2235   // use the one which will directly select.
2236   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2237   if (MFI->getMode().IEEE)
2238     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2239   else
2240     B.buildFMinNum(Min, Fract, Const, Flags);
2241 
2242   Register CorrectedFract = Min;
2243   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2244     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2245     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2246   }
2247 
2248   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2249   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2250 
2251   MI.eraseFromParent();
2252   return true;
2253 }
2254 
2255 // Turn an illegal packed v2s16 build vector into bit operations.
2256 // TODO: This should probably be a bitcast action in LegalizerHelper.
2257 bool AMDGPULegalizerInfo::legalizeBuildVector(
2258   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2259   Register Dst = MI.getOperand(0).getReg();
2260   const LLT S32 = LLT::scalar(32);
2261   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2262 
2263   Register Src0 = MI.getOperand(1).getReg();
2264   Register Src1 = MI.getOperand(2).getReg();
2265   assert(MRI.getType(Src0) == LLT::scalar(16));
2266 
2267   B.setInstr(MI);
2268   auto Merge = B.buildMerge(S32, {Src0, Src1});
2269   B.buildBitcast(Dst, Merge);
2270 
2271   MI.eraseFromParent();
2272   return true;
2273 }
2274 
// Return the use branch instruction, or null if the usage is invalid.
2276 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2277                                        MachineRegisterInfo &MRI,
2278                                        MachineInstr *&Br) {
2279   Register CondDef = MI.getOperand(0).getReg();
2280   if (!MRI.hasOneNonDBGUse(CondDef))
2281     return nullptr;
2282 
2283   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2284   if (UseMI.getParent() != MI.getParent() ||
2285       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2286     return nullptr;
2287 
2288   // Make sure the cond br is followed by a G_BR
2289   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2290   if (Next != MI.getParent()->end()) {
2291     if (Next->getOpcode() != AMDGPU::G_BR)
2292       return nullptr;
2293     Br = &*Next;
2294   }
2295 
2296   return &UseMI;
2297 }
2298 
2299 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2300                                                MachineRegisterInfo &MRI,
2301                                                Register LiveIn,
2302                                                Register PhyReg) const {
2303   assert(PhyReg.isPhysical() && "Physical register expected");
2304 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2307   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2308   if (!MRI.getVRegDef(LiveIn)) {
2309     // FIXME: Should have scoped insert pt
2310     MachineBasicBlock &OrigInsBB = B.getMBB();
2311     auto OrigInsPt = B.getInsertPt();
2312 
2313     MachineBasicBlock &EntryMBB = B.getMF().front();
2314     EntryMBB.addLiveIn(PhyReg);
2315     B.setInsertPt(EntryMBB, EntryMBB.begin());
2316     B.buildCopy(LiveIn, PhyReg);
2317 
2318     B.setInsertPt(OrigInsBB, OrigInsPt);
2319   }
2320 
2321   return LiveIn;
2322 }
2323 
2324 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2325                                                 MachineRegisterInfo &MRI,
2326                                                 Register PhyReg, LLT Ty,
2327                                                 bool InsertLiveInCopy) const {
2328   assert(PhyReg.isPhysical() && "Physical register expected");
2329 
  // Get or create the virtual live-in register.
2331   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2332   if (!LiveIn) {
2333     LiveIn = MRI.createGenericVirtualRegister(Ty);
2334     MRI.addLiveIn(PhyReg, LiveIn);
2335   }
2336 
  // When the actual copy required is from a virtual register to a physical
  // register (to be inserted later), inserting a live-in copy from the
  // physical register to a virtual register is not required.
2340   if (!InsertLiveInCopy)
2341     return LiveIn;
2342 
2343   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2344 }
2345 
2346 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2347     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2348   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2349   const ArgDescriptor *Arg;
2350   const TargetRegisterClass *RC;
2351   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2352   if (!Arg) {
2353     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2354     return nullptr;
2355   }
2356   return Arg;
2357 }
2358 
2359 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2360                                          const ArgDescriptor *Arg) const {
2361   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2362     return false; // TODO: Handle these
2363 
2364   Register SrcReg = Arg->getRegister();
2365   assert(SrcReg.isPhysical() && "Physical register expected");
2366   assert(DstReg.isVirtual() && "Virtual register expected");
2367 
2368   MachineRegisterInfo &MRI = *B.getMRI();
2369 
2370   LLT Ty = MRI.getType(DstReg);
2371   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2372 
2373   if (Arg->isMasked()) {
2374     // TODO: Should we try to emit this once in the entry block?
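    // The value occupies a bit field within the physical register, so shift
    // the field down and mask away the surrounding bits.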
2375     const LLT S32 = LLT::scalar(32);
2376     const unsigned Mask = Arg->getMask();
2377     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2378 
2379     Register AndMaskSrc = LiveIn;
2380 
2381     if (Shift != 0) {
2382       auto ShiftAmt = B.buildConstant(S32, Shift);
2383       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2384     }
2385 
2386     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2387   } else {
2388     B.buildCopy(DstReg, LiveIn);
2389   }
2390 
2391   return true;
2392 }
2393 
2394 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2395     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2396     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2397   B.setInstr(MI);
2398 
2399   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2400   if (!Arg)
2401     return false;
2402 
2403   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2404     return false;
2405 
2406   MI.eraseFromParent();
2407   return true;
2408 }
2409 
2410 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2411                                        MachineRegisterInfo &MRI,
2412                                        MachineIRBuilder &B) const {
2413   B.setInstr(MI);
2414   Register Dst = MI.getOperand(0).getReg();
2415   LLT DstTy = MRI.getType(Dst);
2416   LLT S16 = LLT::scalar(16);
2417   LLT S32 = LLT::scalar(32);
2418   LLT S64 = LLT::scalar(64);
2419 
2420   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2421     return true;
2422 
2423   if (DstTy == S16)
2424     return legalizeFDIV16(MI, MRI, B);
2425   if (DstTy == S32)
2426     return legalizeFDIV32(MI, MRI, B);
2427   if (DstTy == S64)
2428     return legalizeFDIV64(MI, MRI, B);
2429 
2430   return false;
2431 }
2432 
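// Build an approximate unsigned 32-bit reciprocal (~2^32 / Src) by taking the
// hardware single precision reciprocal and scaling it back up by 2^32.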
2433 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2434   const LLT S32 = LLT::scalar(32);
2435 
2436   auto Cvt0 = B.buildUITOFP(S32, Src);
2437   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2438   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2439   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2440   return B.buildFPTOUI(S32, Mul).getReg(0);
2441 }
2442 
2443 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2444                                                   Register DstReg,
2445                                                   Register Num,
2446                                                   Register Den,
2447                                                   bool IsRem) const {
2448   const LLT S1 = LLT::scalar(1);
2449   const LLT S32 = LLT::scalar(32);
2450 
2451   // RCP =  URECIP(Den) = 2^32 / Den + e
2452   // e is rounding error.
2453   auto RCP = buildDivRCP(B, Den);
2454 
2455   // RCP_LO = mul(RCP, Den)
2456   auto RCP_LO = B.buildMul(S32, RCP, Den);
2457 
  // RCP_HI = mulhu(RCP, Den)
2459   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2460 
2461   // NEG_RCP_LO = -RCP_LO
2462   auto Zero = B.buildConstant(S32, 0);
2463   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2464 
2465   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2466   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2467   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2468 
2469   // Calculate the rounding error from the URECIP instruction
2470   // E = mulhu(ABS_RCP_LO, RCP)
2471   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2472 
2473   // RCP_A_E = RCP + E
2474   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2475 
2476   // RCP_S_E = RCP - E
2477   auto RCP_S_E = B.buildSub(S32, RCP, E);
2478 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2480   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2481 
  // Quotient = mulhu(Tmp0, Num)
2483   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2484 
2485   // Num_S_Remainder = Quotient * Den
2486   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2487 
2488   // Remainder = Num - Num_S_Remainder
2489   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2490 
2491   // Remainder_GE_Den = Remainder >= Den
2492   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2493 
2494   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2495   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2496                                        Num, Num_S_Remainder);
2497 
2498   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2499   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2500 
2501   // Calculate Division result:
2502 
2503   // Quotient_A_One = Quotient + 1
2504   auto One = B.buildConstant(S32, 1);
2505   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2506 
2507   // Quotient_S_One = Quotient - 1
2508   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2509 
2510   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2511   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2512 
  if (IsRem) {
2517     // Calculate Rem result:
2518     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2519 
2520     // Remainder_A_Den = Remainder + Den
2521     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2522 
2523     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2524     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2525 
2526     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2527     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2530   }
2531 }
2532 
2533 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2534                                               MachineRegisterInfo &MRI,
2535                                               MachineIRBuilder &B) const {
2536   B.setInstr(MI);
2537   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2538   Register DstReg = MI.getOperand(0).getReg();
2539   Register Num = MI.getOperand(1).getReg();
2540   Register Den = MI.getOperand(2).getReg();
2541   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2542   MI.eraseFromParent();
2543   return true;
2544 }
2545 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
2547 //
2548 // Return lo, hi of result
2549 //
2550 // %cvt.lo = G_UITOFP Val.lo
2551 // %cvt.hi = G_UITOFP Val.hi
2552 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2553 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2554 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2555 // %mul2 = G_FMUL %mul1, 2**(-32)
2556 // %trunc = G_INTRINSIC_TRUNC %mul2
2557 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2558 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2559 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2560                                                        Register Val) {
2561   const LLT S32 = LLT::scalar(32);
2562   auto Unmerge = B.buildUnmerge(S32, Val);
2563 
2564   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2565   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2566 
2567   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2568                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2569 
2570   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2571   auto Mul1 =
2572       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2573 
2574   // 2**(-32)
2575   auto Mul2 =
2576       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2577   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2578 
2579   // -(2**32)
2580   auto Mad2 = B.buildFMAD(S32, Trunc,
2581                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2582 
2583   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2584   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2585 
2586   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2587 }
2588 
2589 bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
2590                                               MachineRegisterInfo &MRI,
2591                                               MachineIRBuilder &B) const {
2592   B.setInstr(MI);
2593 
2594   const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
2595   const LLT S32 = LLT::scalar(32);
2596   const LLT S64 = LLT::scalar(64);
2597   const LLT S1 = LLT::scalar(1);
2598   Register Numer = MI.getOperand(1).getReg();
2599   Register Denom = MI.getOperand(2).getReg();
2600   Register RcpLo, RcpHi;
2601 
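  // Compute an approximate 64-bit reciprocal of the denominator, refine it
  // with two correction steps, form a quotient estimate with a 64-bit high
  // multiply against the numerator, and then fix up the result with at most
  // two conditional subtractions of the denominator.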
2602   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2603 
2604   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2605 
2606   auto Zero64 = B.buildConstant(S64, 0);
2607   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2608 
2609   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2610   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2611 
2612   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2613   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2614   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2615 
2616   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2617   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2618   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2619   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2620 
2621   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2622   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2623   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2624   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2625   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2626 
2627   auto Zero32 = B.buildConstant(S32, 0);
2628   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2629   auto Add2_HiC =
2630       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2631   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2632   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2633 
2634   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2635   Register NumerLo = UnmergeNumer.getReg(0);
2636   Register NumerHi = UnmergeNumer.getReg(1);
2637 
2638   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2639   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2640   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2641   Register Mul3_Lo = UnmergeMul3.getReg(0);
2642   Register Mul3_Hi = UnmergeMul3.getReg(1);
2643   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2644   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2645   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2646   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2647 
2648   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2649   Register DenomLo = UnmergeDenom.getReg(0);
2650   Register DenomHi = UnmergeDenom.getReg(1);
2651 
2652   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2653   auto C1 = B.buildSExt(S32, CmpHi);
2654 
2655   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2656   auto C2 = B.buildSExt(S32, CmpLo);
2657 
2658   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2659   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2660 
2661   // TODO: Here and below portions of the code can be enclosed into if/endif.
2662   // Currently control flow is unconditional and we have 4 selects after
2663   // potential endif to substitute PHIs.
2664 
2665   // if C3 != 0 ...
2666   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2667   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2668   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2669   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2670 
2671   auto One64 = B.buildConstant(S64, 1);
2672   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2673 
2674   auto C4 =
2675       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2676   auto C5 =
2677       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2678   auto C6 = B.buildSelect(
2679       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2680 
2681   // if (C6 != 0)
2682   auto Add4 = B.buildAdd(S64, Add3, One64);
2683   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2684 
2685   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2686   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2687   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2688 
2689   // endif C6
2690   // endif C3
2691 
2692   if (IsDiv) {
2693     auto Sel1 = B.buildSelect(
2694         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2695     B.buildSelect(MI.getOperand(0),
2696                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2697   } else {
2698     auto Sel2 = B.buildSelect(
2699         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2700     B.buildSelect(MI.getOperand(0),
2701                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2702   }
2703 
2704   MI.eraseFromParent();
2705   return true;
2706 }
2707 
2708 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2709                                             MachineRegisterInfo &MRI,
2710                                             MachineIRBuilder &B) const {
2711   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2712   if (Ty == LLT::scalar(32))
2713     return legalizeUDIV_UREM32(MI, MRI, B);
2714   if (Ty == LLT::scalar(64))
2715     return legalizeUDIV_UREM64(MI, MRI, B);
2716   return false;
2717 }
2718 
2719 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2720                                               MachineRegisterInfo &MRI,
2721                                               MachineIRBuilder &B) const {
2722   B.setInstr(MI);
2723   const LLT S32 = LLT::scalar(32);
2724 
2725   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2726   Register DstReg = MI.getOperand(0).getReg();
2727   Register LHS = MI.getOperand(1).getReg();
2728   Register RHS = MI.getOperand(2).getReg();
2729 
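  // Take the absolute value of each operand using the (x + sign) ^ sign
  // identity, divide unsigned, then restore the sign: the remainder takes the
  // sign of the dividend and the quotient takes the XOR of the operand signs.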
2730   auto ThirtyOne = B.buildConstant(S32, 31);
2731   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2733 
2734   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2735   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2736 
2737   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2738   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2739 
2740   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2741   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2742 
2743   if (IsRem) {
2744     auto RSign = LHSign; // Remainder sign is the same as LHS
2745     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2746     B.buildSub(DstReg, UDivRem, RSign);
2747   } else {
2748     auto DSign = B.buildXor(S32, LHSign, RHSign);
2749     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2750     B.buildSub(DstReg, UDivRem, DSign);
2751   }
2752 
2753   MI.eraseFromParent();
2754   return true;
2755 }
2756 
2757 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2758                                             MachineRegisterInfo &MRI,
2759                                             MachineIRBuilder &B) const {
2760   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2761     return legalizeSDIV_SREM32(MI, MRI, B);
2762   return false;
2763 }
2764 
2765 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2766                                                  MachineRegisterInfo &MRI,
2767                                                  MachineIRBuilder &B) const {
2768   Register Res = MI.getOperand(0).getReg();
2769   Register LHS = MI.getOperand(1).getReg();
2770   Register RHS = MI.getOperand(2).getReg();
2771 
2772   uint16_t Flags = MI.getFlags();
2773 
2774   LLT ResTy = MRI.getType(Res);
2775   LLT S32 = LLT::scalar(32);
2776   LLT S64 = LLT::scalar(64);
2777 
2778   const MachineFunction &MF = B.getMF();
2779   bool Unsafe =
2780     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2781 
2782   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2783     return false;
2784 
2785   if (!Unsafe && ResTy == S32 &&
2786       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2787     return false;
2788 
2789   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2790     // 1 / x -> RCP(x)
2791     if (CLHS->isExactlyValue(1.0)) {
2792       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2793         .addUse(RHS)
2794         .setMIFlags(Flags);
2795 
2796       MI.eraseFromParent();
2797       return true;
2798     }
2799 
2800     // -1 / x -> RCP( FNEG(x) )
2801     if (CLHS->isExactlyValue(-1.0)) {
2802       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2803       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2804         .addUse(FNeg.getReg(0))
2805         .setMIFlags(Flags);
2806 
2807       MI.eraseFromParent();
2808       return true;
2809     }
2810   }
2811 
2812   // x / y -> x * (1.0 / y)
2813   if (Unsafe) {
2814     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2815       .addUse(RHS)
2816       .setMIFlags(Flags);
2817     B.buildFMul(Res, LHS, RCP, Flags);
2818 
2819     MI.eraseFromParent();
2820     return true;
2821   }
2822 
2823   return false;
2824 }
2825 
2826 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2827                                          MachineRegisterInfo &MRI,
2828                                          MachineIRBuilder &B) const {
2829   B.setInstr(MI);
2830   Register Res = MI.getOperand(0).getReg();
2831   Register LHS = MI.getOperand(1).getReg();
2832   Register RHS = MI.getOperand(2).getReg();
2833 
2834   uint16_t Flags = MI.getFlags();
2835 
2836   LLT S16 = LLT::scalar(16);
2837   LLT S32 = LLT::scalar(32);
2838 
2839   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2840   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2841 
2842   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2843     .addUse(RHSExt.getReg(0))
2844     .setMIFlags(Flags);
2845 
2846   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2847   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2848 
2849   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2850     .addUse(RDst.getReg(0))
2851     .addUse(RHS)
2852     .addUse(LHS)
2853     .setMIFlags(Flags);
2854 
2855   MI.eraseFromParent();
2856   return true;
2857 }
2858 
2859 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2860 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2861 static void toggleSPDenormMode(bool Enable,
2862                                MachineIRBuilder &B,
2863                                const GCNSubtarget &ST,
2864                                AMDGPU::SIModeRegisterDefaults Mode) {
2865   // Set SP denorm mode to this value.
2866   unsigned SPDenormMode =
2867     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2868 
2869   if (ST.hasDenormModeInst()) {
2870     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2871     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2872 
2873     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2874     B.buildInstr(AMDGPU::S_DENORM_MODE)
2875       .addImm(NewDenormModeValue);
2876 
2877   } else {
2878     // Select FP32 bit field in mode register.
2879     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2880                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2881                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2882 
2883     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2884       .addImm(SPDenormMode)
2885       .addImm(SPDenormModeBitField);
2886   }
2887 }
2888 
2889 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2890                                          MachineRegisterInfo &MRI,
2891                                          MachineIRBuilder &B) const {
2892   B.setInstr(MI);
2893   Register Res = MI.getOperand(0).getReg();
2894   Register LHS = MI.getOperand(1).getReg();
2895   Register RHS = MI.getOperand(2).getReg();
2896   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2897   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2898 
2899   uint16_t Flags = MI.getFlags();
2900 
2901   LLT S32 = LLT::scalar(32);
2902   LLT S1 = LLT::scalar(1);
2903 
2904   auto One = B.buildFConstant(S32, 1.0f);
2905 
2906   auto DenominatorScaled =
2907     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2908       .addUse(LHS)
2909       .addUse(RHS)
2910       .addImm(0)
2911       .setMIFlags(Flags);
2912   auto NumeratorScaled =
2913     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2914       .addUse(LHS)
2915       .addUse(RHS)
2916       .addImm(1)
2917       .setMIFlags(Flags);
2918 
2919   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2920     .addUse(DenominatorScaled.getReg(0))
2921     .setMIFlags(Flags);
2922   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2923 
2924   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2925   // aren't modeled as reading it.
2926   if (!Mode.allFP32Denormals())
2927     toggleSPDenormMode(true, B, ST, Mode);
2928 
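  // Refine the reciprocal and the quotient with a Newton-Raphson style FMA
  // sequence, where d and n are the scaled denominator and numerator and r is
  // the initial rcp approximation:
  //   Fma0 = 1 - d * r          (error of the reciprocal estimate)
  //   Fma1 = r + r * Fma0       (refined reciprocal)
  //   Mul  = n * Fma1           (initial quotient)
  //   Fma2 = n - d * Mul        (quotient residual)
  //   Fma3 = Mul + Fma1 * Fma2  (refined quotient)
  //   Fma4 = n - d * Fma3       (final residual, consumed by div_fmas)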
2929   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2930   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2931   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2932   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2933   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2934   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2935 
2936   if (!Mode.allFP32Denormals())
2937     toggleSPDenormMode(false, B, ST, Mode);
2938 
2939   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2940     .addUse(Fma4.getReg(0))
2941     .addUse(Fma1.getReg(0))
2942     .addUse(Fma3.getReg(0))
2943     .addUse(NumeratorScaled.getReg(1))
2944     .setMIFlags(Flags);
2945 
2946   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2947     .addUse(Fmas.getReg(0))
2948     .addUse(RHS)
2949     .addUse(LHS)
2950     .setMIFlags(Flags);
2951 
2952   MI.eraseFromParent();
2953   return true;
2954 }
2955 
2956 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2957                                          MachineRegisterInfo &MRI,
2958                                          MachineIRBuilder &B) const {
2959   B.setInstr(MI);
2960   Register Res = MI.getOperand(0).getReg();
2961   Register LHS = MI.getOperand(1).getReg();
2962   Register RHS = MI.getOperand(2).getReg();
2963 
2964   uint16_t Flags = MI.getFlags();
2965 
2966   LLT S64 = LLT::scalar(64);
2967   LLT S1 = LLT::scalar(1);
2968 
2969   auto One = B.buildFConstant(S64, 1.0);
2970 
2971   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2972     .addUse(LHS)
2973     .addUse(RHS)
2974     .addImm(0)
2975     .setMIFlags(Flags);
2976 
2977   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2978 
2979   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2980     .addUse(DivScale0.getReg(0))
2981     .setMIFlags(Flags);
2982 
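  // The f64 expansion uses an extra reciprocal refinement step. With d the
  // scaled denominator (DivScale0), n the scaled numerator (DivScale1) and r
  // the rcp estimate:
  //   Fma0 = 1 - d * r           (error of the reciprocal estimate)
  //   Fma1 = r + r * Fma0        (first refinement)
  //   Fma2 = 1 - d * Fma1        (remaining error)
  //   Fma3 = Fma1 + Fma1 * Fma2  (second refinement)
  //   Mul  = n * Fma3            (quotient estimate)
  //   Fma4 = n - d * Mul         (residual, consumed by div_fmas)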
2983   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2984   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2985   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2986 
2987   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2988     .addUse(LHS)
2989     .addUse(RHS)
2990     .addImm(1)
2991     .setMIFlags(Flags);
2992 
2993   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2995   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2996 
2997   Register Scale;
2998   if (!ST.hasUsableDivScaleConditionOutput()) {
2999     // Workaround a hardware bug on SI where the condition output from div_scale
3000     // is not usable.
3001 
3002     LLT S32 = LLT::scalar(32);
3003 
3004     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3005     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3006     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3007     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3008 
3009     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3010                               Scale1Unmerge.getReg(1));
3011     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3012                               Scale0Unmerge.getReg(1));
3013     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3014   } else {
3015     Scale = DivScale1.getReg(1);
3016   }
3017 
3018   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3019     .addUse(Fma4.getReg(0))
3020     .addUse(Fma3.getReg(0))
3021     .addUse(Mul.getReg(0))
3022     .addUse(Scale)
3023     .setMIFlags(Flags);
3024 
3025   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3026     .addUse(Fmas.getReg(0))
3027     .addUse(RHS)
3028     .addUse(LHS)
3029     .setMIFlags(Flags);
3030 
3031   MI.eraseFromParent();
3032   return true;
3033 }
3034 
3035 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3036                                                  MachineRegisterInfo &MRI,
3037                                                  MachineIRBuilder &B) const {
3038   B.setInstr(MI);
3039   Register Res = MI.getOperand(0).getReg();
3040   Register LHS = MI.getOperand(2).getReg();
3041   Register RHS = MI.getOperand(3).getReg();
3042   uint16_t Flags = MI.getFlags();
3043 
3044   LLT S32 = LLT::scalar(32);
3045   LLT S1 = LLT::scalar(1);
3046 
3047   auto Abs = B.buildFAbs(S32, RHS, Flags);
3048   const APFloat C0Val(1.0f);
3049 
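  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32. If |RHS| exceeds 2^96, its
  // reciprocal may underflow into the denormal range and be flushed to zero, so
  // pre-scale the denominator by 2^-32 and apply the same factor to the final
  // product, leaving the overall quotient unchanged.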
3050   auto C0 = B.buildConstant(S32, 0x6f800000);
3051   auto C1 = B.buildConstant(S32, 0x2f800000);
3052   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3053 
3054   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3055   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3056 
3057   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3058 
3059   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3060     .addUse(Mul0.getReg(0))
3061     .setMIFlags(Flags);
3062 
3063   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3064 
3065   B.buildFMul(Res, Sel, Mul1, Flags);
3066 
3067   MI.eraseFromParent();
3068   return true;
3069 }
3070 
3071 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3072                                                  MachineRegisterInfo &MRI,
3073                                                  MachineIRBuilder &B) const {
3074   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3075   if (!MFI->isEntryFunction()) {
3076     return legalizePreloadedArgIntrin(MI, MRI, B,
3077                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3078   }
3079 
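  // In an entry function the implicit arguments are laid out in the kernarg
  // segment directly after the explicit kernel arguments, so the pointer is
  // kernarg.segment.ptr plus the implicit parameter offset.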
3080   B.setInstr(MI);
3081 
3082   uint64_t Offset =
3083     ST.getTargetLowering()->getImplicitParameterOffset(
3084       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3085   Register DstReg = MI.getOperand(0).getReg();
3086   LLT DstTy = MRI.getType(DstReg);
3087   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3088 
3089   const ArgDescriptor *Arg;
3090   const TargetRegisterClass *RC;
3091   std::tie(Arg, RC)
3092     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3093   if (!Arg)
3094     return false;
3095 
3096   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3097   if (!loadInputValue(KernargPtrReg, B, Arg))
3098     return false;
3099 
3100   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3101   MI.eraseFromParent();
3102   return true;
3103 }
3104 
3105 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3106                                               MachineRegisterInfo &MRI,
3107                                               MachineIRBuilder &B,
3108                                               unsigned AddrSpace) const {
3109   B.setInstr(MI);
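  // A flat pointer lies in the queried segment iff its high 32 bits equal that
  // segment's aperture base, so compare the pointer's high dword against the
  // aperture register.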
3110   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3111   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3112   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3113   MI.eraseFromParent();
3114   return true;
3115 }
3116 
3117 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3118 // offset (the offset that is included in bounds checking and swizzling, to be
3119 // split between the instruction's voffset and immoffset fields) and soffset
3120 // (the offset that is excluded from bounds checking and swizzling, to go in
3121 // the instruction's soffset field).  This function takes the first kind of
3122 // offset and figures out how to split it between voffset and immoffset.
3123 std::tuple<Register, unsigned, unsigned>
3124 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3125                                         Register OrigOffset) const {
3126   const unsigned MaxImm = 4095;
3127   Register BaseReg;
3128   unsigned TotalConstOffset;
3129   MachineInstr *OffsetDef;
3130   const LLT S32 = LLT::scalar(32);
3131 
3132   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3133     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3134 
3135   unsigned ImmOffset = TotalConstOffset;
3136 
3137   // If the immediate value is too big for the immoffset field, put the value
3138   // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands a better chance
3140   // of being CSEd with the copy/add for another similar load/store.
3141   // However, do not do that rounding down to a multiple of 4096 if that is a
3142   // negative number, as it appears to be illegal to have a negative offset
3143   // in the vgpr, even if adding the immediate offset makes it positive.
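  // For example, a constant offset of 5000 becomes Overflow = 4096 (added to
  // the voffset register) and ImmOffset = 904.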
3144   unsigned Overflow = ImmOffset & ~MaxImm;
3145   ImmOffset -= Overflow;
3146   if ((int32_t)Overflow < 0) {
3147     Overflow += ImmOffset;
3148     ImmOffset = 0;
3149   }
3150 
3151   if (Overflow != 0) {
3152     if (!BaseReg) {
3153       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3154     } else {
3155       auto OverflowVal = B.buildConstant(S32, Overflow);
3156       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3157     }
3158   }
3159 
3160   if (!BaseReg)
3161     BaseReg = B.buildConstant(S32, 0).getReg(0);
3162 
3163   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3164 }
3165 
3166 /// Handle register layout difference for f16 images for some subtargets.
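/// Subtargets with unpacked D16 VMEM expect each 16-bit element in the low half
/// of its own 32-bit register, so <N x s16> data is any-extended element-wise
/// into an <N x s32> vector here.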
3167 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3168                                              MachineRegisterInfo &MRI,
3169                                              Register Reg) const {
3170   if (!ST.hasUnpackedD16VMem())
3171     return Reg;
3172 
3173   const LLT S16 = LLT::scalar(16);
3174   const LLT S32 = LLT::scalar(32);
3175   LLT StoreVT = MRI.getType(Reg);
3176   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3177 
3178   auto Unmerge = B.buildUnmerge(S16, Reg);
3179 
3180   SmallVector<Register, 4> WideRegs;
3181   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3182     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3183 
3184   int NumElts = StoreVT.getNumElements();
3185 
3186   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3187 }
3188 
3189 Register AMDGPULegalizerInfo::fixStoreSourceType(
3190   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3191   MachineRegisterInfo *MRI = B.getMRI();
3192   LLT Ty = MRI->getType(VData);
3193 
3194   const LLT S16 = LLT::scalar(16);
3195 
  // Fix up illegal register types for i8 and i16 stores.
3197   if (Ty == LLT::scalar(8) || Ty == S16) {
3198     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3199     return AnyExt;
3200   }
3201 
3202   if (Ty.isVector()) {
3203     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3204       if (IsFormat)
3205         return handleD16VData(B, *MRI, VData);
3206     }
3207   }
3208 
3209   return VData;
3210 }
3211 
3212 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3213                                               MachineRegisterInfo &MRI,
3214                                               MachineIRBuilder &B,
3215                                               bool IsTyped,
3216                                               bool IsFormat) const {
3217   B.setInstr(MI);
3218 
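  // Operand layout (no result): intrinsic ID, vdata, rsrc, [vindex for struct
  // variants], voffset, soffset, [format for typed variants], aux.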
3219   Register VData = MI.getOperand(1).getReg();
3220   LLT Ty = MRI.getType(VData);
3221   LLT EltTy = Ty.getScalarType();
3222   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3223   const LLT S32 = LLT::scalar(32);
3224 
3225   VData = fixStoreSourceType(B, VData, IsFormat);
3226   Register RSrc = MI.getOperand(2).getReg();
3227 
3228   MachineMemOperand *MMO = *MI.memoperands_begin();
3229   const int MemSize = MMO->getSize();
3230 
3231   unsigned ImmOffset;
3232   unsigned TotalOffset;
3233 
3234   // The typed intrinsics add an immediate after the registers.
3235   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3236 
3237   // The struct intrinsic variants add one additional operand over raw.
3238   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3239   Register VIndex;
3240   int OpOffset = 0;
3241   if (HasVIndex) {
3242     VIndex = MI.getOperand(3).getReg();
3243     OpOffset = 1;
3244   }
3245 
3246   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3247   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3248 
3249   unsigned Format = 0;
3250   if (IsTyped) {
3251     Format = MI.getOperand(5 + OpOffset).getImm();
3252     ++OpOffset;
3253   }
3254 
3255   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3256 
3257   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3258   if (TotalOffset != 0)
3259     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3260 
3261   unsigned Opc;
3262   if (IsTyped) {
3263     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3264                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3265   } else if (IsFormat) {
3266     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3267                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3268   } else {
3269     switch (MemSize) {
3270     case 1:
3271       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3272       break;
3273     case 2:
3274       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3275       break;
3276     default:
3277       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3278       break;
3279     }
3280   }
3281 
3282   if (!VIndex)
3283     VIndex = B.buildConstant(S32, 0).getReg(0);
3284 
3285   auto MIB = B.buildInstr(Opc)
3286     .addUse(VData)              // vdata
3287     .addUse(RSrc)               // rsrc
3288     .addUse(VIndex)             // vindex
3289     .addUse(VOffset)            // voffset
3290     .addUse(SOffset)            // soffset
3291     .addImm(ImmOffset);         // offset(imm)
3292 
3293   if (IsTyped)
3294     MIB.addImm(Format);
3295 
3296   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3297      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3298      .addMemOperand(MMO);
3299 
3300   MI.eraseFromParent();
3301   return true;
3302 }
3303 
3304 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3305                                              MachineRegisterInfo &MRI,
3306                                              MachineIRBuilder &B,
3307                                              bool IsFormat,
3308                                              bool IsTyped) const {
3309   B.setInstr(MI);
3310 
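  // Operand layout: dst, intrinsic ID, rsrc, [vindex for struct variants],
  // voffset, soffset, [format for typed variants], aux.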
3311   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3312   MachineMemOperand *MMO = *MI.memoperands_begin();
3313   const int MemSize = MMO->getSize();
3314   const LLT S32 = LLT::scalar(32);
3315 
3316   Register Dst = MI.getOperand(0).getReg();
3317   Register RSrc = MI.getOperand(2).getReg();
3318 
3319   // The typed intrinsics add an immediate after the registers.
3320   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3321 
3322   // The struct intrinsic variants add one additional operand over raw.
3323   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3324   Register VIndex;
3325   int OpOffset = 0;
3326   if (HasVIndex) {
3327     VIndex = MI.getOperand(3).getReg();
3328     OpOffset = 1;
3329   }
3330 
3331   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3332   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3333 
3334   unsigned Format = 0;
3335   if (IsTyped) {
3336     Format = MI.getOperand(5 + OpOffset).getImm();
3337     ++OpOffset;
3338   }
3339 
3340   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3341   unsigned ImmOffset;
3342   unsigned TotalOffset;
3343 
3344   LLT Ty = MRI.getType(Dst);
3345   LLT EltTy = Ty.getScalarType();
3346   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3347   const bool Unpacked = ST.hasUnpackedD16VMem();
3348 
3349   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3350   if (TotalOffset != 0)
3351     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3352 
3353   unsigned Opc;
3354 
3355   if (IsTyped) {
3356     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3357                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3358   } else if (IsFormat) {
3359     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3360                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3361   } else {
3362     switch (MemSize) {
3363     case 1:
3364       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3365       break;
3366     case 2:
3367       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3368       break;
3369     default:
3370       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3371       break;
3372     }
3373   }
3374 
3375   Register LoadDstReg;
3376 
3377   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3378   LLT UnpackedTy = Ty.changeElementSize(32);
3379 
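  // Sub-dword results and unpacked d16 results come back wider than the
  // requested type, so load into a temporary register and truncate or repack
  // into the original destination afterwards.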
3380   if (IsExtLoad)
3381     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3382   else if (Unpacked && IsD16 && Ty.isVector())
3383     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3384   else
3385     LoadDstReg = Dst;
3386 
3387   if (!VIndex)
3388     VIndex = B.buildConstant(S32, 0).getReg(0);
3389 
3390   auto MIB = B.buildInstr(Opc)
3391     .addDef(LoadDstReg)         // vdata
3392     .addUse(RSrc)               // rsrc
3393     .addUse(VIndex)             // vindex
3394     .addUse(VOffset)            // voffset
3395     .addUse(SOffset)            // soffset
3396     .addImm(ImmOffset);         // offset(imm)
3397 
3398   if (IsTyped)
3399     MIB.addImm(Format);
3400 
3401   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3402      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3403      .addMemOperand(MMO);
3404 
3405   if (LoadDstReg != Dst) {
3406     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3407 
    // The result of an extending load was widened; truncate it back down.
3409     if (IsExtLoad)
3410       B.buildTrunc(Dst, LoadDstReg);
3411     else {
3412       // Repack to original 16-bit vector result
3413       // FIXME: G_TRUNC should work, but legalization currently fails
3414       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3415       SmallVector<Register, 4> Repack;
3416       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3417         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3418       B.buildMerge(Dst, Repack);
3419     }
3420   }
3421 
3422   MI.eraseFromParent();
3423   return true;
3424 }
3425 
3426 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3427                                                MachineIRBuilder &B,
3428                                                bool IsInc) const {
3429   B.setInstr(MI);
3430   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3431                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3432   B.buildInstr(Opc)
3433     .addDef(MI.getOperand(0).getReg())
3434     .addUse(MI.getOperand(2).getReg())
3435     .addUse(MI.getOperand(3).getReg())
3436     .cloneMemRefs(MI);
3437   MI.eraseFromParent();
3438   return true;
3439 }
3440 
3441 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3442   switch (IntrID) {
3443   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3444   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3445     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3446   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3447   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3448     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3449   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3450   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3451     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3452   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3453   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3454     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3455   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3456   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3457     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3458   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3459   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3460     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3461   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3462   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3463     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3464   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3465   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3466     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3467   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3468   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3469     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3470   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3471   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3472     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3473   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3474   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3475     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3476   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3477   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3478     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3479   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3480   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3481     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3482   default:
3483     llvm_unreachable("unhandled atomic opcode");
3484   }
3485 }
3486 
3487 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3488                                                MachineIRBuilder &B,
3489                                                Intrinsic::ID IID) const {
3490   B.setInstr(MI);
3491 
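  // Operand layout: dst, intrinsic ID, vdata, [cmp for cmpswap], rsrc,
  // [vindex for struct variants], voffset, soffset, aux.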
3492   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3493                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3494 
3495   Register Dst = MI.getOperand(0).getReg();
3496   Register VData = MI.getOperand(2).getReg();
3497 
3498   Register CmpVal;
3499   int OpOffset = 0;
3500 
3501   if (IsCmpSwap) {
3502     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3503     ++OpOffset;
3504   }
3505 
3506   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3507   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3508 
3509   // The struct intrinsic variants add one additional operand over raw.
3510   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3511   Register VIndex;
3512   if (HasVIndex) {
3513     VIndex = MI.getOperand(4 + OpOffset).getReg();
3514     ++OpOffset;
3515   }
3516 
3517   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3518   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3519   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3520 
3521   MachineMemOperand *MMO = *MI.memoperands_begin();
3522 
3523   unsigned ImmOffset;
3524   unsigned TotalOffset;
3525   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3526   if (TotalOffset != 0)
3527     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3528 
3529   if (!VIndex)
3530     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3531 
3532   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3533     .addDef(Dst)
3534     .addUse(VData); // vdata
3535 
3536   if (IsCmpSwap)
3537     MIB.addReg(CmpVal);
3538 
3539   MIB.addUse(RSrc)               // rsrc
3540      .addUse(VIndex)             // vindex
3541      .addUse(VOffset)            // voffset
3542      .addUse(SOffset)            // soffset
3543      .addImm(ImmOffset)          // offset(imm)
3544      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3545      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3546      .addMemOperand(MMO);
3547 
3548   MI.eraseFromParent();
3549   return true;
3550 }
3551 
/// Pack the s16 typed address operands of \p MI, starting at \p AddrIdx, into
/// dword sized <2 x s16> registers appended to \p PackedAddrs.
3554 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3555                                         SmallVectorImpl<Register> &PackedAddrs,
3556                                         int AddrIdx, int DimIdx, int NumVAddrs,
3557                                         int NumGradients) {
3558   const LLT S16 = LLT::scalar(16);
3559   const LLT V2S16 = LLT::vector(2, 16);
3560 
3561   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3562     MachineOperand &SrcOp = MI.getOperand(I);
3563     if (!SrcOp.isReg())
3564       continue; // _L to _LZ may have eliminated this.
3565 
3566     Register AddrReg = SrcOp.getReg();
3567 
3568     if (I < DimIdx) {
3569       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3570       PackedAddrs.push_back(AddrReg);
3571     } else {
3572       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3573       // derivatives dx/dh and dx/dv are packed with undef.
3574       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3575           ((NumGradients / 2) % 2 == 1 &&
3576            (I == DimIdx + (NumGradients / 2) - 1 ||
3577             I == DimIdx + NumGradients - 1)) ||
3578           // Check for _L to _LZ optimization
3579           !MI.getOperand(I + 1).isReg()) {
3580         PackedAddrs.push_back(
3581             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3582                 .getReg(0));
3583       } else {
3584         PackedAddrs.push_back(
3585             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3586                 .getReg(0));
3587         ++I;
3588       }
3589     }
3590   }
3591 }
3592 
3593 /// Convert from separate vaddr components to a single vector address register,
3594 /// and replace the remaining operands with $noreg.
3595 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3596                                      int DimIdx, int NumVAddrs) {
3597   const LLT S32 = LLT::scalar(32);
3598 
3599   SmallVector<Register, 8> AddrRegs;
3600   for (int I = 0; I != NumVAddrs; ++I) {
3601     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3602     if (SrcOp.isReg()) {
3603       AddrRegs.push_back(SrcOp.getReg());
3604       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3605     }
3606   }
3607 
3608   int NumAddrRegs = AddrRegs.size();
3609   if (NumAddrRegs != 1) {
3610     // Round up to 8 elements for v5-v7
3611     // FIXME: Missing intermediate sized register classes and instructions.
3612     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3613       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3614       auto Undef = B.buildUndef(S32);
3615       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3616       NumAddrRegs = RoundedNumRegs;
3617     }
3618 
3619     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3620     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3621   }
3622 
3623   for (int I = 1; I != NumVAddrs; ++I) {
3624     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3625     if (SrcOp.isReg())
3626       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3627   }
3628 }
3629 
3630 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3631 ///
3632 /// Depending on the subtarget, load/store with 16-bit element data need to be
3633 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3634 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3635 /// registers.
3636 ///
/// We don't want to directly select image instructions just yet, but we do want
/// to expose all of the register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. To avoid defining
/// a multitude of intermediate image instructions, directly hack on the
/// intrinsic's arguments. In cases like a16 addresses, this requires padding the
/// now-unnecessary arguments with $noreg.
3643 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3644     MachineInstr &MI, MachineIRBuilder &B,
3645     GISelChangeObserver &Observer,
3646     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3647   B.setInstr(MI);
3648 
3649   const int NumDefs = MI.getNumExplicitDefs();
3650   bool IsTFE = NumDefs == 2;
3651   // We are only processing the operands of d16 image operations on subtargets
3652   // that use the unpacked register layout, or need to repack the TFE result.
3653 
3654   // TODO: Do we need to guard against already legalized intrinsics?
3655   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3656     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3657 
3658   MachineRegisterInfo *MRI = B.getMRI();
3659   const LLT S32 = LLT::scalar(32);
3660   const LLT S16 = LLT::scalar(16);
3661   const LLT V2S16 = LLT::vector(2, 16);
3662 
3663   // Index of first address argument
3664   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3665 
  // Check for 16-bit addresses, which need to be packed into 32-bit registers.
3667   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3668   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3669   const bool IsA16 = AddrTy == S16;
3670 
3671   int NumVAddrs, NumGradients;
3672   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3673   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3674     getDMaskIdx(BaseOpcode, NumDefs);
3675   unsigned DMask = 0;
3676 
3677   int DMaskLanes = 0;
3678   if (!BaseOpcode->Atomic) {
3679     DMask = MI.getOperand(DMaskIdx).getImm();
3680     if (BaseOpcode->Gather4) {
3681       DMaskLanes = 4;
3682     } else if (DMask != 0) {
3683       DMaskLanes = countPopulation(DMask);
3684     } else if (!IsTFE && !BaseOpcode->Store) {
3685       // If dmask is 0, this is a no-op load. This can be eliminated.
3686       B.buildUndef(MI.getOperand(0));
3687       MI.eraseFromParent();
3688       return true;
3689     }
3690   }
3691 
3692   Observer.changingInstr(MI);
3693   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3694 
3695   unsigned NewOpcode = NumDefs == 0 ?
3696     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3697 
3698   // Track that we legalized this
3699   MI.setDesc(B.getTII().get(NewOpcode));
3700 
  // TFE is enabled and dmask is 0, so an error flag is expected. Force dmask to
  // be at least 1; otherwise the instruction will fail.
3703   if (IsTFE && DMask == 0) {
3704     DMask = 0x1;
3705     DMaskLanes = 1;
3706     MI.getOperand(DMaskIdx).setImm(DMask);
3707   }
3708 
3709   if (BaseOpcode->Atomic) {
3710     Register VData0 = MI.getOperand(2).getReg();
3711     LLT Ty = MRI->getType(VData0);
3712 
3713     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3714     if (Ty.isVector())
3715       return false;
3716 
3717     if (BaseOpcode->AtomicX2) {
3718       Register VData1 = MI.getOperand(3).getReg();
3719       // The two values are packed in one register.
3720       LLT PackedTy = LLT::vector(2, Ty);
3721       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3722       MI.getOperand(2).setReg(Concat.getReg(0));
3723       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3724     }
3725   }
3726 
3727   int CorrectedNumVAddrs = NumVAddrs;
3728 
  // Optimize _L to _LZ when the LOD is known to be zero (or negative).
3730   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3731         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3732     const ConstantFP *ConstantLod;
3733     const int LodIdx = AddrIdx + NumVAddrs - 1;
3734 
3735     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3736       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3737         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3738         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3739           LZMappingInfo->LZ, ImageDimIntr->Dim);
3740 
3741         // The starting indexes should remain in the same place.
3742         --NumVAddrs;
3743         --CorrectedNumVAddrs;
3744 
3745         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3746           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3747         MI.RemoveOperand(LodIdx);
3748       }
3749     }
3750   }
3751 
3752   // Optimize _mip away, when 'lod' is zero
3753   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3754     int64_t ConstantLod;
3755     const int LodIdx = AddrIdx + NumVAddrs - 1;
3756 
3757     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3758       if (ConstantLod == 0) {
        // TODO: Change the intrinsic opcode and remove the operand instead of
        // replacing it with 0, as is done for the _L to _LZ handling above.
3761         MI.getOperand(LodIdx).ChangeToImmediate(0);
3762         --CorrectedNumVAddrs;
3763       }
3764     }
3765   }
3766 
3767   // If the register allocator cannot place the address registers contiguously
3768   // without introducing moves, then using the non-sequential address encoding
3769   // is always preferable, since it saves VALU instructions and is usually a
3770   // wash in terms of code size or even better.
3771   //
3772   // However, we currently have no way of hinting to the register allocator
3773   // that MIMG addresses should be placed contiguously when it is possible to
3774   // do so, so force non-NSA for the common 2-address case as a heuristic.
3775   //
3776   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3777   // allocation when possible.
3778   const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3779 
3780   // Rewrite the addressing register layout before doing anything else.
3781   if (IsA16) {
    // A16 addressing requires the r128-a16 or gfx10 a16 subtarget feature; bail
    // out if neither is available.
3784     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3785       return false;
3786 
3787     if (NumVAddrs > 1) {
3788       SmallVector<Register, 4> PackedRegs;
3789       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3790                                   NumGradients);
3791 
3792       if (!UseNSA && PackedRegs.size() > 1) {
3793         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3794         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3795         PackedRegs[0] = Concat.getReg(0);
3796         PackedRegs.resize(1);
3797       }
3798 
3799       const int NumPacked = PackedRegs.size();
3800       for (int I = 0; I != NumVAddrs; ++I) {
3801         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3802         if (!SrcOp.isReg()) {
3803           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3804           continue;
3805         }
3806 
3807         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3808 
3809         if (I < NumPacked)
3810           SrcOp.setReg(PackedRegs[I]);
3811         else
3812           SrcOp.setReg(AMDGPU::NoRegister);
3813       }
3814     }
3815   } else if (!UseNSA && NumVAddrs > 1) {
3816     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3817   }
3818 
3819 
3820   if (BaseOpcode->Store) { // No TFE for stores?
3821     // TODO: Handle dmask trim
3822     Register VData = MI.getOperand(1).getReg();
3823     LLT Ty = MRI->getType(VData);
3824     if (!Ty.isVector() || Ty.getElementType() != S16)
3825       return true;
3826 
3827     B.setInstr(MI);
3828 
3829     Register RepackedReg = handleD16VData(B, *MRI, VData);
3830     if (RepackedReg != VData) {
3831       MI.getOperand(1).setReg(RepackedReg);
3832     }
3833 
3834     return true;
3835   }
3836 
3837   Register DstReg = MI.getOperand(0).getReg();
3838   LLT Ty = MRI->getType(DstReg);
3839   const LLT EltTy = Ty.getScalarType();
3840   const bool IsD16 = Ty.getScalarType() == S16;
3841   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3842 
3843   // Confirm that the return type is large enough for the dmask specified
3844   if (NumElts < DMaskLanes)
3845     return false;
3846 
3847   if (NumElts > 4 || DMaskLanes > 4)
3848     return false;
3849 
3850   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3851   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3852 
3853   // The raw dword aligned data component of the load. The only legal cases
3854   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3856   LLT RoundedTy;
3857 
  // S32 vector to cover all of the data, plus the TFE result element.
3859   LLT TFETy;
3860 
3861   // Register type to use for each loaded component. Will be S32 or V2S16.
3862   LLT RegTy;
3863 
3864   if (IsD16 && ST.hasUnpackedD16VMem()) {
3865     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3866     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3867     RegTy = S32;
3868   } else {
3869     unsigned EltSize = EltTy.getSizeInBits();
3870     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3871     unsigned RoundedSize = 32 * RoundedElts;
3872     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3873     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3874     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3875   }
3876 
3877   // The return type does not need adjustment.
3878   // TODO: Should we change s16 case to s32 or <2 x s16>?
3879   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3880     return true;
3881 
3882   Register Dst1Reg;
3883 
3884   // Insert after the instruction.
3885   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3886 
3887   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3888   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3889   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3890   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3891 
3892   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3893 
3894   MI.getOperand(0).setReg(NewResultReg);
3895 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
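  // For example, a TFE load with <2 x s32> data is rewritten to produce
  // <3 x s32>: two data dwords followed by the TFE status dword.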
3900 
3901   if (IsTFE) {
3902     Dst1Reg = MI.getOperand(1).getReg();
3903     if (MRI->getType(Dst1Reg) != S32)
3904       return false;
3905 
3906     // TODO: Make sure the TFE operand bit is set.
3907     MI.RemoveOperand(1);
3908 
3909     // Handle the easy case that requires no repack instructions.
3910     if (Ty == S32) {
3911       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3912       return true;
3913     }
3914   }
3915 
3916   // Now figure out how to copy the new result register back into the old
3917   // result.
3918   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3919 
3920   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3921 
3922   if (ResultNumRegs == 1) {
3923     assert(!IsTFE);
3924     ResultRegs[0] = NewResultReg;
3925   } else {
3926     // We have to repack into a new vector of some kind.
3927     for (int I = 0; I != NumDataRegs; ++I)
3928       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3929     B.buildUnmerge(ResultRegs, NewResultReg);
3930 
3931     // Drop the final TFE element to get the data part. The TFE result is
3932     // directly written to the right place already.
3933     if (IsTFE)
3934       ResultRegs.resize(NumDataRegs);
3935   }
3936 
3937   // For an s16 scalar result, we form an s32 result with a truncate regardless
3938   // of packed vs. unpacked.
3939   if (IsD16 && !Ty.isVector()) {
3940     B.buildTrunc(DstReg, ResultRegs[0]);
3941     return true;
3942   }
3943 
3944   // Avoid a build/concat_vector of 1 entry.
3945   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3946     B.buildBitcast(DstReg, ResultRegs[0]);
3947     return true;
3948   }
3949 
3950   assert(Ty.isVector());
3951 
3952   if (IsD16) {
3953     // For packed D16 results with TFE enabled, all the data components are
3954     // S32. Cast back to the expected type.
3955     //
    // TODO: We don't really need to load s32 elements. We would only need one
3957     // cast for the TFE result if a multiple of v2s16 was used.
3958     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3959       for (Register &Reg : ResultRegs)
3960         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3961     } else if (ST.hasUnpackedD16VMem()) {
3962       for (Register &Reg : ResultRegs)
3963         Reg = B.buildTrunc(S16, Reg).getReg(0);
3964     }
3965   }
3966 
3967   auto padWithUndef = [&](LLT Ty, int NumElts) {
3968     if (NumElts == 0)
3969       return;
3970     Register Undef = B.buildUndef(Ty).getReg(0);
3971     for (int I = 0; I != NumElts; ++I)
3972       ResultRegs.push_back(Undef);
3973   };
3974 
3975   // Pad out any elements eliminated due to the dmask.
3976   LLT ResTy = MRI->getType(ResultRegs[0]);
3977   if (!ResTy.isVector()) {
3978     padWithUndef(ResTy, NumElts - ResultRegs.size());
3979     B.buildBuildVector(DstReg, ResultRegs);
3980     return true;
3981   }
3982 
3983   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3984   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3985 
3986   // Deal with the one annoying legal case.
3987   const LLT V3S16 = LLT::vector(3, 16);
3988   if (Ty == V3S16) {
3989     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
3990     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
3991     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
3992     return true;
3993   }
3994 
3995   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
3996   B.buildConcatVectors(DstReg, ResultRegs);
3997   return true;
3998 }
3999 
4000 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4001   MachineInstr &MI, MachineIRBuilder &B,
4002   GISelChangeObserver &Observer) const {
4003   Register Dst = MI.getOperand(0).getReg();
4004   LLT Ty = B.getMRI()->getType(Dst);
4005   unsigned Size = Ty.getSizeInBits();
4006   MachineFunction &MF = B.getMF();
4007 
4008   Observer.changingInstr(MI);
4009 
4010   // FIXME: We don't really need this intermediate instruction. The intrinsic
4011   // should be fixed to have a memory operand. Since it's readnone, we're not
4012   // allowed to add one.
4013   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4014   MI.RemoveOperand(1); // Remove intrinsic ID
4015 
4016   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4017   // TODO: Should this use datalayout alignment?
4018   const unsigned MemSize = (Size + 7) / 8;
4019   const Align MemAlign(4);
4020   MachineMemOperand *MMO = MF.getMachineMemOperand(
4021       MachinePointerInfo(),
4022       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4023           MachineMemOperand::MOInvariant,
4024       MemSize, MemAlign);
4025   MI.addMemOperand(MF, MMO);
4026 
4027   // There are no 96-bit result scalar loads, but widening to 128-bit should
4028   // always be legal. We may need to restore this to a 96-bit result if it turns
4029   // out this needs to be converted to a vector load during RegBankSelect.
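  // For example, a <3 x s32> (96-bit) result is widened here to <4 x s32>, and
  // an s96 scalar to s128.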
4030   if (!isPowerOf2_32(Size)) {
4031     LegalizerHelper Helper(MF, *this, Observer, B);
4032     B.setInstr(MI);
4033 
4034     if (Ty.isVector())
4035       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4036     else
4037       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4038   }
4039 
4040   Observer.changedInstr(MI);
4041   return true;
4042 }
4043 
4044 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4045                                                 MachineRegisterInfo &MRI,
4046                                                 MachineIRBuilder &B) const {
4047   B.setInstr(MI);
4048 
  // On non-HSA targets, or when the trap handler is disabled, just insert
  // s_endpgm.
4050   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4051       !ST.isTrapHandlerEnabled()) {
4052     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4053   } else {
4054     // Pass queue pointer to trap handler as input, and insert trap instruction
4055     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4056     const ArgDescriptor *Arg =
4057         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4058     if (!Arg)
4059       return false;
4060     MachineRegisterInfo &MRI = *B.getMRI();
4061     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4062     Register LiveIn = getLiveInRegister(
4063         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4064         /*InsertLiveInCopy=*/false);
4065     if (!loadInputValue(LiveIn, B, Arg))
4066       return false;
4067     B.buildCopy(SGPR01, LiveIn);
4068     B.buildInstr(AMDGPU::S_TRAP)
4069         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4070         .addReg(SGPR01, RegState::Implicit);
4071   }
4072 
4073   MI.eraseFromParent();
4074   return true;
4075 }
4076 
4077 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4078     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4079   B.setInstr(MI);
4080 
  // On non-HSA targets, or when the trap handler is disabled, report a warning
  // that debugtrap is not supported.
4083   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4084       !ST.isTrapHandlerEnabled()) {
4085     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4086                                      "debugtrap handler not supported",
4087                                      MI.getDebugLoc(), DS_Warning);
4088     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4089     Ctx.diagnose(NoTrap);
4090   } else {
4091     // Insert debug-trap instruction
4092     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4093   }
4094 
4095   MI.eraseFromParent();
4096   return true;
4097 }
4098 
4099 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
4100                                             MachineIRBuilder &B,
4101                                             GISelChangeObserver &Observer) const {
4102   MachineRegisterInfo &MRI = *B.getMRI();
4103 
  // Replace the G_BRCOND user with the exec-manipulating branch pseudos.
4105   auto IntrID = MI.getIntrinsicID();
4106   switch (IntrID) {
4107   case Intrinsic::amdgcn_if:
4108   case Intrinsic::amdgcn_else: {
4109     MachineInstr *Br = nullptr;
4110     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
4111       const SIRegisterInfo *TRI
4112         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4113 
4114       B.setInstr(*BrCond);
4115       Register Def = MI.getOperand(1).getReg();
4116       Register Use = MI.getOperand(3).getReg();
4117 
4118       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
4119       if (Br)
4120         BrTarget = Br->getOperand(0).getMBB();
4121 
4122       if (IntrID == Intrinsic::amdgcn_if) {
4123         B.buildInstr(AMDGPU::SI_IF)
4124           .addDef(Def)
4125           .addUse(Use)
4126           .addMBB(BrTarget);
4127       } else {
4128         B.buildInstr(AMDGPU::SI_ELSE)
4129           .addDef(Def)
4130           .addUse(Use)
4131           .addMBB(BrTarget)
4132           .addImm(0);
4133       }
4134 
4135       if (Br)
4136         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
4137 
4138       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4139       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4140       MI.eraseFromParent();
4141       BrCond->eraseFromParent();
4142       return true;
4143     }
4144 
4145     return false;
4146   }
4147   case Intrinsic::amdgcn_loop: {
4148     MachineInstr *Br = nullptr;
4149     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
4150       const SIRegisterInfo *TRI
4151         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4152 
4153       B.setInstr(*BrCond);
4154 
4155       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
4156       if (Br)
4157         BrTarget = Br->getOperand(0).getMBB();
4158 
4159       Register Reg = MI.getOperand(2).getReg();
4160       B.buildInstr(AMDGPU::SI_LOOP)
4161         .addUse(Reg)
4162         .addMBB(BrTarget);
4163 
4164       if (Br)
4165         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
4166 
4167       MI.eraseFromParent();
4168       BrCond->eraseFromParent();
4169       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4170       return true;
4171     }
4172 
4173     return false;
4174   }
4175   case Intrinsic::amdgcn_kernarg_segment_ptr:
4176     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4177       B.setInstr(MI);
4178       // This only makes sense to call in a kernel, so just lower to null.
4179       B.buildConstant(MI.getOperand(0).getReg(), 0);
4180       MI.eraseFromParent();
4181       return true;
4182     }
4183 
4184     return legalizePreloadedArgIntrin(
4185       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4186   case Intrinsic::amdgcn_implicitarg_ptr:
4187     return legalizeImplicitArgPtr(MI, MRI, B);
4188   case Intrinsic::amdgcn_workitem_id_x:
4189     return legalizePreloadedArgIntrin(MI, MRI, B,
4190                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4191   case Intrinsic::amdgcn_workitem_id_y:
4192     return legalizePreloadedArgIntrin(MI, MRI, B,
4193                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4194   case Intrinsic::amdgcn_workitem_id_z:
4195     return legalizePreloadedArgIntrin(MI, MRI, B,
4196                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4197   case Intrinsic::amdgcn_workgroup_id_x:
4198     return legalizePreloadedArgIntrin(MI, MRI, B,
4199                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4200   case Intrinsic::amdgcn_workgroup_id_y:
4201     return legalizePreloadedArgIntrin(MI, MRI, B,
4202                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4203   case Intrinsic::amdgcn_workgroup_id_z:
4204     return legalizePreloadedArgIntrin(MI, MRI, B,
4205                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4206   case Intrinsic::amdgcn_dispatch_ptr:
4207     return legalizePreloadedArgIntrin(MI, MRI, B,
4208                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4209   case Intrinsic::amdgcn_queue_ptr:
4210     return legalizePreloadedArgIntrin(MI, MRI, B,
4211                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4212   case Intrinsic::amdgcn_implicit_buffer_ptr:
4213     return legalizePreloadedArgIntrin(
4214       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4215   case Intrinsic::amdgcn_dispatch_id:
4216     return legalizePreloadedArgIntrin(MI, MRI, B,
4217                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4218   case Intrinsic::amdgcn_fdiv_fast:
4219     return legalizeFDIVFastIntrin(MI, MRI, B);
4220   case Intrinsic::amdgcn_is_shared:
4221     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4222   case Intrinsic::amdgcn_is_private:
4223     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4224   case Intrinsic::amdgcn_wavefrontsize: {
4225     B.setInstr(MI);
4226     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4227     MI.eraseFromParent();
4228     return true;
4229   }
4230   case Intrinsic::amdgcn_s_buffer_load:
4231     return legalizeSBufferLoad(MI, B, Observer);
4232   case Intrinsic::amdgcn_raw_buffer_store:
4233   case Intrinsic::amdgcn_struct_buffer_store:
4234     return legalizeBufferStore(MI, MRI, B, false, false);
4235   case Intrinsic::amdgcn_raw_buffer_store_format:
4236   case Intrinsic::amdgcn_struct_buffer_store_format:
4237     return legalizeBufferStore(MI, MRI, B, false, true);
4238   case Intrinsic::amdgcn_raw_tbuffer_store:
4239   case Intrinsic::amdgcn_struct_tbuffer_store:
4240     return legalizeBufferStore(MI, MRI, B, true, true);
4241   case Intrinsic::amdgcn_raw_buffer_load:
4242   case Intrinsic::amdgcn_struct_buffer_load:
4243     return legalizeBufferLoad(MI, MRI, B, false, false);
4244   case Intrinsic::amdgcn_raw_buffer_load_format:
4245   case Intrinsic::amdgcn_struct_buffer_load_format:
4246     return legalizeBufferLoad(MI, MRI, B, true, false);
4247   case Intrinsic::amdgcn_raw_tbuffer_load:
4248   case Intrinsic::amdgcn_struct_tbuffer_load:
4249     return legalizeBufferLoad(MI, MRI, B, true, true);
4250   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4251   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4252   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4253   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4254   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4255   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4256   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4257   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4258   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4259   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4260   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4261   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4262   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4263   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4264   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4265   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4266   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4267   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4268   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4269   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4270   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4271   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4272   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4273   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4274   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4275   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4276     return legalizeBufferAtomic(MI, B, IntrID);
4277   case Intrinsic::amdgcn_atomic_inc:
4278     return legalizeAtomicIncDec(MI, B, true);
4279   case Intrinsic::amdgcn_atomic_dec:
4280     return legalizeAtomicIncDec(MI, B, false);
4281   case Intrinsic::trap:
4282     return legalizeTrapIntrinsic(MI, MRI, B);
4283   case Intrinsic::debugtrap:
4284     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4285   default: {
4286     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4287             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4288       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4289     return true;
4290   }
4291   }
4292 
4293   return true;
4294 }
4295