1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
46 // Round the number of elements to the next power of two elements
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the number of bits to the next power of two bits
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
70   return [=](const LegalityQuery &Query) {
71     return Query.Types[TypeIdx].getSizeInBits() == Size;
72   };
73 }
74 
75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     return Ty.isVector() &&
79            Ty.getNumElements() % 2 != 0 &&
80            Ty.getElementType().getSizeInBits() < 32 &&
81            Ty.getSizeInBits() % 32 != 0;
82   };
83 }
84 
85 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
86   return [=](const LegalityQuery &Query) {
87     const LLT Ty = Query.Types[TypeIdx];
88     const LLT EltTy = Ty.getScalarType();
89     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
90   };
91 }
92 
93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getElementType();
97     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
98   };
99 }
100 
// Mutation that shrinks the element count of the vector at \p TypeIdx so that
// each resulting piece is at most 64 bits wide.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    // Number of 64-bit pieces needed to cover the whole vector, rounding up.
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    // Divide the elements across the pieces; the +1 rounds an odd element
    // count up rather than dropping an element.
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    // scalarOrVector collapses a one-element result to a plain scalar type.
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}
111 
112 // Increase the number of vector elements to reach the next multiple of 32-bit
113 // type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    // Despite the name, this is the size in 32-bit words rounded up, not the
    // rounded bit count itself; it is multiplied by 32 again below.
    const int NextMul32 = (Size + 31) / 32;

    // Only meaningful for sub-dword elements; 32-bit-and-wider elements are
    // already dword multiples.
    assert(EltSize < 32);

    // Smallest element count that covers NextMul32 dwords, rounding up.
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}
129 
130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
131   return [=](const LegalityQuery &Query) {
132     const LLT QueryTy = Query.Types[TypeIdx];
133     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
134   };
135 }
136 
137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
138   return [=](const LegalityQuery &Query) {
139     const LLT QueryTy = Query.Types[TypeIdx];
140     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
141   };
142 }
143 
144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT QueryTy = Query.Types[TypeIdx];
147     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
148   };
149 }
150 
151 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
152 // v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      // 16-bit elements only in even counts (i.e. multiples of v2s16).
      // NOTE(review): unlike the scalar path below, the vector path does not
      // enforce the 1024-bit total-size cap — confirm that is intentional.
      return EltSize == 32 || EltSize == 64 ||
            (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    // Scalars (and pointers by size): any multiple of 32 bits up to 1024.
    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}
166 
167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getElementType() == Type;
171   };
172 }
173 
174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     if (!QueryTy.isVector())
178       return false;
179     const LLT EltTy = QueryTy.getElementType();
180     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
181   };
182 }
183 
184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     const LLT Ty = Query.Types[TypeIdx];
187     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
188            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
189   };
190 }
191 
192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
193   return [=](const LegalityQuery &Query) {
194     return Query.Types[TypeIdx0].getSizeInBits() <
195            Query.Types[TypeIdx1].getSizeInBits();
196   };
197 }
198 
199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
200   return [=](const LegalityQuery &Query) {
201     return Query.Types[TypeIdx0].getSizeInBits() >
202            Query.Types[TypeIdx1].getSizeInBits();
203   };
204 }
205 
206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
207                                          const GCNTargetMachine &TM)
208   :  ST(ST_) {
209   using namespace TargetOpcode;
210 
211   auto GetAddrSpacePtr = [&TM](unsigned AS) {
212     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
213   };
214 
215   const LLT S1 = LLT::scalar(1);
216   const LLT S16 = LLT::scalar(16);
217   const LLT S32 = LLT::scalar(32);
218   const LLT S64 = LLT::scalar(64);
219   const LLT S128 = LLT::scalar(128);
220   const LLT S256 = LLT::scalar(256);
221   const LLT S512 = LLT::scalar(512);
222   const LLT S1024 = LLT::scalar(1024);
223 
224   const LLT V2S16 = LLT::vector(2, 16);
225   const LLT V4S16 = LLT::vector(4, 16);
226 
227   const LLT V2S32 = LLT::vector(2, 32);
228   const LLT V3S32 = LLT::vector(3, 32);
229   const LLT V4S32 = LLT::vector(4, 32);
230   const LLT V5S32 = LLT::vector(5, 32);
231   const LLT V6S32 = LLT::vector(6, 32);
232   const LLT V7S32 = LLT::vector(7, 32);
233   const LLT V8S32 = LLT::vector(8, 32);
234   const LLT V9S32 = LLT::vector(9, 32);
235   const LLT V10S32 = LLT::vector(10, 32);
236   const LLT V11S32 = LLT::vector(11, 32);
237   const LLT V12S32 = LLT::vector(12, 32);
238   const LLT V13S32 = LLT::vector(13, 32);
239   const LLT V14S32 = LLT::vector(14, 32);
240   const LLT V15S32 = LLT::vector(15, 32);
241   const LLT V16S32 = LLT::vector(16, 32);
242   const LLT V32S32 = LLT::vector(32, 32);
243 
244   const LLT V2S64 = LLT::vector(2, 64);
245   const LLT V3S64 = LLT::vector(3, 64);
246   const LLT V4S64 = LLT::vector(4, 64);
247   const LLT V5S64 = LLT::vector(5, 64);
248   const LLT V6S64 = LLT::vector(6, 64);
249   const LLT V7S64 = LLT::vector(7, 64);
250   const LLT V8S64 = LLT::vector(8, 64);
251   const LLT V16S64 = LLT::vector(16, 64);
252 
253   std::initializer_list<LLT> AllS32Vectors =
254     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
255      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
256   std::initializer_list<LLT> AllS64Vectors =
257     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
258 
259   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
260   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
261   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
262   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
263   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
264   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
265   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
266 
267   const LLT CodePtr = FlatPtr;
268 
269   const std::initializer_list<LLT> AddrSpaces64 = {
270     GlobalPtr, ConstantPtr, FlatPtr
271   };
272 
273   const std::initializer_list<LLT> AddrSpaces32 = {
274     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
275   };
276 
277   const std::initializer_list<LLT> FPTypesBase = {
278     S32, S64
279   };
280 
281   const std::initializer_list<LLT> FPTypes16 = {
282     S32, S64, S16
283   };
284 
285   const std::initializer_list<LLT> FPTypesPK16 = {
286     S32, S64, S16, V2S16
287   };
288 
289   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
290 
291   setAction({G_BRCOND, S1}, Legal); // VCC branches
292   setAction({G_BRCOND, S32}, Legal); // SCC branches
293 
294   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
295   // elements for v3s16
296   getActionDefinitionsBuilder(G_PHI)
297     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
298     .legalFor(AllS32Vectors)
299     .legalFor(AllS64Vectors)
300     .legalFor(AddrSpaces64)
301     .legalFor(AddrSpaces32)
302     .clampScalar(0, S32, S256)
303     .widenScalarToNextPow2(0, 32)
304     .clampMaxNumElements(0, S32, 16)
305     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
306     .legalIf(isPointer(0));
307 
308   if (ST.hasVOP3PInsts()) {
309     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
310       .legalFor({S32, S16, V2S16})
311       .clampScalar(0, S16, S32)
312       .clampMaxNumElements(0, S16, 2)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else if (ST.has16BitInsts()) {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32, S16})
318       .clampScalar(0, S16, S32)
319       .scalarize(0)
320       .widenScalarToNextPow2(0, 32);
321   } else {
322     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
323       .legalFor({S32})
324       .clampScalar(0, S32, S32)
325       .scalarize(0);
326   }
327 
328   // FIXME: Not really legal. Placeholder for custom lowering.
329   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
330     .customFor({S32, S64})
331     .clampScalar(0, S32, S64)
332     .widenScalarToNextPow2(0, 32)
333     .scalarize(0);
334 
335   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
336     .legalFor({S32})
337     .clampScalar(0, S32, S32)
338     .scalarize(0);
339 
340   // Report legal for any types we can handle anywhere. For the cases only legal
341   // on the SALU, RegBankSelect will be able to re-legalize.
342   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
343     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
344     .clampScalar(0, S32, S64)
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
347     .widenScalarToNextPow2(0)
348     .scalarize(0);
349 
350   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
351                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
352     .legalFor({{S32, S1}, {S32, S32}})
353     .minScalar(0, S32)
354     // TODO: .scalarize(0)
355     .lower();
356 
357   getActionDefinitionsBuilder(G_BITCAST)
358     // Don't worry about the size constraint.
359     .legalIf(all(isRegisterType(0), isRegisterType(1)))
360     .lower();
361 
362 
363   getActionDefinitionsBuilder(G_CONSTANT)
364     .legalFor({S1, S32, S64, S16, GlobalPtr,
365                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
366     .clampScalar(0, S32, S64)
367     .widenScalarToNextPow2(0)
368     .legalIf(isPointer(0));
369 
370   getActionDefinitionsBuilder(G_FCONSTANT)
371     .legalFor({S32, S64, S16})
372     .clampScalar(0, S16, S64);
373 
374   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
375     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
376                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
377     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378     .clampScalarOrElt(0, S32, S1024)
379     .legalIf(isMultiple32(0))
380     .widenScalarToNextPow2(0, 32)
381     .clampMaxNumElements(0, S32, 16);
382 
383   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
384   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
385     .unsupportedFor({PrivatePtr})
386     .custom();
387   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
388 
389   auto &FPOpActions = getActionDefinitionsBuilder(
390     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
391     .legalFor({S32, S64});
392   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
393     .customFor({S32, S64});
394   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
395     .customFor({S32, S64});
396 
397   if (ST.has16BitInsts()) {
398     if (ST.hasVOP3PInsts())
399       FPOpActions.legalFor({S16, V2S16});
400     else
401       FPOpActions.legalFor({S16});
402 
403     TrigActions.customFor({S16});
404     FDIVActions.customFor({S16});
405   }
406 
407   auto &MinNumMaxNum = getActionDefinitionsBuilder({
408       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
409 
410   if (ST.hasVOP3PInsts()) {
411     MinNumMaxNum.customFor(FPTypesPK16)
412       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
413       .clampMaxNumElements(0, S16, 2)
414       .clampScalar(0, S16, S64)
415       .scalarize(0);
416   } else if (ST.has16BitInsts()) {
417     MinNumMaxNum.customFor(FPTypes16)
418       .clampScalar(0, S16, S64)
419       .scalarize(0);
420   } else {
421     MinNumMaxNum.customFor(FPTypesBase)
422       .clampScalar(0, S32, S64)
423       .scalarize(0);
424   }
425 
426   if (ST.hasVOP3PInsts())
427     FPOpActions.clampMaxNumElements(0, S16, 2);
428 
429   FPOpActions
430     .scalarize(0)
431     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
432 
433   TrigActions
434     .scalarize(0)
435     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
436 
437   FDIVActions
438     .scalarize(0)
439     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
440 
441   getActionDefinitionsBuilder({G_FNEG, G_FABS})
442     .legalFor(FPTypesPK16)
443     .clampMaxNumElements(0, S16, 2)
444     .scalarize(0)
445     .clampScalar(0, S16, S64);
446 
447   if (ST.has16BitInsts()) {
448     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
449       .legalFor({S32, S64, S16})
450       .scalarize(0)
451       .clampScalar(0, S16, S64);
452   } else {
453     getActionDefinitionsBuilder(G_FSQRT)
454       .legalFor({S32, S64})
455       .scalarize(0)
456       .clampScalar(0, S32, S64);
457 
458     if (ST.hasFractBug()) {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .customFor({S64})
461         .legalFor({S32, S64})
462         .scalarize(0)
463         .clampScalar(0, S32, S64);
464     } else {
465       getActionDefinitionsBuilder(G_FFLOOR)
466         .legalFor({S32, S64})
467         .scalarize(0)
468         .clampScalar(0, S32, S64);
469     }
470   }
471 
472   getActionDefinitionsBuilder(G_FPTRUNC)
473     .legalFor({{S32, S64}, {S16, S32}})
474     .scalarize(0)
475     .lower();
476 
477   getActionDefinitionsBuilder(G_FPEXT)
478     .legalFor({{S64, S32}, {S32, S16}})
479     .lowerFor({{S64, S16}}) // FIXME: Implement
480     .scalarize(0);
481 
482   getActionDefinitionsBuilder(G_FSUB)
483       // Use actual fsub instruction
484       .legalFor({S32})
485       // Must use fadd + fneg
486       .lowerFor({S64, S16, V2S16})
487       .scalarize(0)
488       .clampScalar(0, S32, S64);
489 
490   // Whether this is legal depends on the floating point mode for the function.
491   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
492   if (ST.hasMadF16())
493     FMad.customFor({S32, S16});
494   else
495     FMad.customFor({S32});
496   FMad.scalarize(0)
497       .lower();
498 
499   // TODO: Do we need to clamp maximum bitwidth?
500   getActionDefinitionsBuilder(G_TRUNC)
501     .legalIf(isScalar(0))
502     .legalFor({{V2S16, V2S32}})
503     .clampMaxNumElements(0, S16, 2)
504     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
505     // situations (like an invalid implicit use), we don't want to infinite loop
506     // in the legalizer.
507     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
508     .alwaysLegal();
509 
510   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
511     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
512                {S32, S1}, {S64, S1}, {S16, S1}})
513     .scalarize(0)
514     .clampScalar(0, S32, S64)
515     .widenScalarToNextPow2(1, 32);
516 
517   // TODO: Split s1->s64 during regbankselect for VALU.
518   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
519     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
520     .lowerFor({{S32, S64}})
521     .lowerIf(typeIs(1, S1))
522     .customFor({{S64, S64}});
523   if (ST.has16BitInsts())
524     IToFP.legalFor({{S16, S16}});
525   IToFP.clampScalar(1, S32, S64)
526        .scalarize(0)
527        .widenScalarToNextPow2(1);
528 
529   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
530     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
531     .customFor({{S64, S64}});
532   if (ST.has16BitInsts())
533     FPToI.legalFor({{S16, S16}});
534   else
535     FPToI.minScalar(1, S32);
536 
537   FPToI.minScalar(0, S32)
538        .scalarize(0)
539        .lower();
540 
541   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
542     .scalarize(0)
543     .lower();
544 
545   if (ST.has16BitInsts()) {
546     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
547       .legalFor({S16, S32, S64})
548       .clampScalar(0, S16, S64)
549       .scalarize(0);
550   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
551     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
552       .legalFor({S32, S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   } else {
556     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
557       .legalFor({S32})
558       .customFor({S64})
559       .clampScalar(0, S32, S64)
560       .scalarize(0);
561   }
562 
563   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
564     .scalarize(0)
565     .alwaysLegal();
566 
567   auto &CmpBuilder =
568     getActionDefinitionsBuilder(G_ICMP)
569     // The compare output type differs based on the register bank of the output,
570     // so make both s1 and s32 legal.
571     //
572     // Scalar compares producing output in scc will be promoted to s32, as that
573     // is the allocatable register type that will be needed for the copy from
574     // scc. This will be promoted during RegBankSelect, and we assume something
575     // before that won't try to use s32 result types.
576     //
577     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
578     // bank.
579     .legalForCartesianProduct(
580       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
581     .legalForCartesianProduct(
582       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
583   if (ST.has16BitInsts()) {
584     CmpBuilder.legalFor({{S1, S16}});
585   }
586 
587   CmpBuilder
588     .widenScalarToNextPow2(1)
589     .clampScalar(1, S32, S64)
590     .scalarize(0)
591     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
592 
593   getActionDefinitionsBuilder(G_FCMP)
594     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
595     .widenScalarToNextPow2(1)
596     .clampScalar(1, S32, S64)
597     .scalarize(0);
598 
599   // FIXME: fpow has a selection pattern that should move to custom lowering.
600   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
601   if (ST.has16BitInsts())
602     Exp2Ops.legalFor({S32, S16});
603   else
604     Exp2Ops.legalFor({S32});
605   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
606   Exp2Ops.scalarize(0);
607 
608   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
609   if (ST.has16BitInsts())
610     ExpOps.customFor({{S32}, {S16}});
611   else
612     ExpOps.customFor({S32});
613   ExpOps.clampScalar(0, MinScalarFPTy, S32)
614         .scalarize(0);
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder(G_CTPOP)
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   // The hardware instructions return a different result on 0 than the generic
626   // instructions expect. The hardware produces -1, but these produce the
627   // bitwidth.
628   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
629     .scalarize(0)
630     .clampScalar(0, S32, S32)
631     .clampScalar(1, S32, S64)
632     .widenScalarToNextPow2(0, 32)
633     .widenScalarToNextPow2(1, 32)
634     .lower();
635 
636   // The 64-bit versions produce 32-bit results, but only on the SALU.
637   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
638     .legalFor({{S32, S32}, {S32, S64}})
639     .clampScalar(0, S32, S32)
640     .clampScalar(1, S32, S64)
641     .scalarize(0)
642     .widenScalarToNextPow2(0, 32)
643     .widenScalarToNextPow2(1, 32);
644 
645   getActionDefinitionsBuilder(G_BITREVERSE)
646     .legalFor({S32})
647     .clampScalar(0, S32, S32)
648     .scalarize(0);
649 
650   if (ST.has16BitInsts()) {
651     getActionDefinitionsBuilder(G_BSWAP)
652       .legalFor({S16, S32, V2S16})
653       .clampMaxNumElements(0, S16, 2)
654       // FIXME: Fixing non-power-of-2 before clamp is workaround for
655       // narrowScalar limitation.
656       .widenScalarToNextPow2(0)
657       .clampScalar(0, S16, S32)
658       .scalarize(0);
659 
660     if (ST.hasVOP3PInsts()) {
661       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
662         .legalFor({S32, S16, V2S16})
663         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
664         .clampMaxNumElements(0, S16, 2)
665         .minScalar(0, S16)
666         .widenScalarToNextPow2(0)
667         .scalarize(0)
668         .lower();
669     } else {
670       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
671         .legalFor({S32, S16})
672         .widenScalarToNextPow2(0)
673         .minScalar(0, S16)
674         .scalarize(0)
675         .lower();
676     }
677   } else {
678     // TODO: Should have same legality without v_perm_b32
679     getActionDefinitionsBuilder(G_BSWAP)
680       .legalFor({S32})
681       .lowerIf(narrowerThan(0, 32))
682       // FIXME: Fixing non-power-of-2 before clamp is workaround for
683       // narrowScalar limitation.
684       .widenScalarToNextPow2(0)
685       .maxScalar(0, S32)
686       .scalarize(0)
687       .lower();
688 
689     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
690       .legalFor({S32})
691       .minScalar(0, S32)
692       .widenScalarToNextPow2(0)
693       .scalarize(0)
694       .lower();
695   }
696 
697   getActionDefinitionsBuilder(G_INTTOPTR)
698     // List the common cases
699     .legalForCartesianProduct(AddrSpaces64, {S64})
700     .legalForCartesianProduct(AddrSpaces32, {S32})
701     .scalarize(0)
702     // Accept any address space as long as the size matches
703     .legalIf(sameSize(0, 1))
704     .widenScalarIf(smallerThan(1, 0),
705       [](const LegalityQuery &Query) {
706         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
707       })
708     .narrowScalarIf(greaterThan(1, 0),
709       [](const LegalityQuery &Query) {
710         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
711       });
712 
713   getActionDefinitionsBuilder(G_PTRTOINT)
714     // List the common cases
715     .legalForCartesianProduct(AddrSpaces64, {S64})
716     .legalForCartesianProduct(AddrSpaces32, {S32})
717     .scalarize(0)
718     // Accept any address space as long as the size matches
719     .legalIf(sameSize(0, 1))
720     .widenScalarIf(smallerThan(0, 1),
721       [](const LegalityQuery &Query) {
722         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
723       })
724     .narrowScalarIf(
725       greaterThan(0, 1),
726       [](const LegalityQuery &Query) {
727         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
728       });
729 
730   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
731     .scalarize(0)
732     .custom();
733 
734   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
735   // handle some operations by just promoting the register during
736   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
737   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
738     switch (AS) {
739     // FIXME: Private element size.
740     case AMDGPUAS::PRIVATE_ADDRESS:
741       return 32;
742     // FIXME: Check subtarget
743     case AMDGPUAS::LOCAL_ADDRESS:
744       return ST.useDS128() ? 128 : 64;
745 
746     // Treat constant and global as identical. SMRD loads are sometimes usable
747     // for global loads (ideally constant address space should be eliminated)
748     // depending on the context. Legality cannot be context dependent, but
749     // RegBankSelect can split the load as necessary depending on the pointer
750     // register bank/uniformity and if the memory is invariant or not written in
751     // a kernel.
752     case AMDGPUAS::CONSTANT_ADDRESS:
753     case AMDGPUAS::GLOBAL_ADDRESS:
754       return IsLoad ? 512 : 128;
755     default:
756       return 128;
757     }
758   };
759 
760   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
761                                     bool IsLoad) -> bool {
762     const LLT DstTy = Query.Types[0];
763 
764     // Split vector extloads.
765     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
766     unsigned Align = Query.MMODescrs[0].AlignInBits;
767 
768     if (MemSize < DstTy.getSizeInBits())
769       MemSize = std::max(MemSize, Align);
770 
771     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
772       return true;
773 
774     const LLT PtrTy = Query.Types[1];
775     unsigned AS = PtrTy.getAddressSpace();
776     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
777       return true;
778 
779     // Catch weird sized loads that don't evenly divide into the access sizes
780     // TODO: May be able to widen depending on alignment etc.
781     unsigned NumRegs = (MemSize + 31) / 32;
782     if (NumRegs == 3) {
783       if (!ST.hasDwordx3LoadStores())
784         return true;
785     } else {
786       // If the alignment allows, these should have been widened.
787       if (!isPowerOf2_32(NumRegs))
788         return true;
789     }
790 
791     if (Align < MemSize) {
792       const SITargetLowering *TLI = ST.getTargetLowering();
793       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
794     }
795 
796     return false;
797   };
798 
799   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
800     unsigned Size = Query.Types[0].getSizeInBits();
801     if (isPowerOf2_32(Size))
802       return false;
803 
804     if (Size == 96 && ST.hasDwordx3LoadStores())
805       return false;
806 
807     unsigned AddrSpace = Query.Types[1].getAddressSpace();
808     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
809       return false;
810 
811     unsigned Align = Query.MMODescrs[0].AlignInBits;
812     unsigned RoundedSize = NextPowerOf2(Size);
813     return (Align >= RoundedSize);
814   };
815 
816   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
817   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
818   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
819 
820   // TODO: Refine based on subtargets which support unaligned access or 128-bit
821   // LDS
822   // TODO: Unsupported flat for SI.
823 
824   for (unsigned Op : {G_LOAD, G_STORE}) {
825     const bool IsStore = Op == G_STORE;
826 
827     auto &Actions = getActionDefinitionsBuilder(Op);
828     // Whitelist the common cases.
829     // TODO: Loads to s16 on gfx9
830     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
831                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
832                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
833                                       {S128, GlobalPtr, 128, GlobalAlign32},
834                                       {S64, GlobalPtr, 64, GlobalAlign32},
835                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
836                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
837                                       {S32, GlobalPtr, 8, GlobalAlign8},
838                                       {S32, GlobalPtr, 16, GlobalAlign16},
839 
840                                       {S32, LocalPtr, 32, 32},
841                                       {S64, LocalPtr, 64, 32},
842                                       {V2S32, LocalPtr, 64, 32},
843                                       {S32, LocalPtr, 8, 8},
844                                       {S32, LocalPtr, 16, 16},
845                                       {V2S16, LocalPtr, 32, 32},
846 
847                                       {S32, PrivatePtr, 32, 32},
848                                       {S32, PrivatePtr, 8, 8},
849                                       {S32, PrivatePtr, 16, 16},
850                                       {V2S16, PrivatePtr, 32, 32},
851 
852                                       {S32, FlatPtr, 32, GlobalAlign32},
853                                       {S32, FlatPtr, 16, GlobalAlign16},
854                                       {S32, FlatPtr, 8, GlobalAlign8},
855                                       {V2S16, FlatPtr, 32, GlobalAlign32},
856 
857                                       {S32, ConstantPtr, 32, GlobalAlign32},
858                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
859                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
860                                       {S64, ConstantPtr, 64, GlobalAlign32},
861                                       {S128, ConstantPtr, 128, GlobalAlign32},
862                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
863     Actions
864         .customIf(typeIs(1, Constant32Ptr))
865         // Widen suitably aligned loads by loading extra elements.
866         .moreElementsIf([=](const LegalityQuery &Query) {
867             const LLT Ty = Query.Types[0];
868             return Op == G_LOAD && Ty.isVector() &&
869                    shouldWidenLoadResult(Query);
870           }, moreElementsToNextPow2(0))
871         .widenScalarIf([=](const LegalityQuery &Query) {
872             const LLT Ty = Query.Types[0];
873             return Op == G_LOAD && !Ty.isVector() &&
874                    shouldWidenLoadResult(Query);
875           }, widenScalarOrEltToNextPow2(0))
876         .narrowScalarIf(
877             [=](const LegalityQuery &Query) -> bool {
878               return !Query.Types[0].isVector() &&
879                      needToSplitMemOp(Query, Op == G_LOAD);
880             },
881             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
882               const LLT DstTy = Query.Types[0];
883               const LLT PtrTy = Query.Types[1];
884 
885               const unsigned DstSize = DstTy.getSizeInBits();
886               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
887 
888               // Split extloads.
889               if (DstSize > MemSize)
890                 return std::make_pair(0, LLT::scalar(MemSize));
891 
892               if (!isPowerOf2_32(DstSize)) {
893                 // We're probably decomposing an odd sized store. Try to split
894                 // to the widest type. TODO: Account for alignment. As-is it
895                 // should be OK, since the new parts will be further legalized.
896                 unsigned FloorSize = PowerOf2Floor(DstSize);
897                 return std::make_pair(0, LLT::scalar(FloorSize));
898               }
899 
900               if (DstSize > 32 && (DstSize % 32 != 0)) {
901                 // FIXME: Need a way to specify non-extload of larger size if
902                 // suitably aligned.
903                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
904               }
905 
906               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
907                                                      Op == G_LOAD);
908               if (MemSize > MaxSize)
909                 return std::make_pair(0, LLT::scalar(MaxSize));
910 
911               unsigned Align = Query.MMODescrs[0].AlignInBits;
912               return std::make_pair(0, LLT::scalar(Align));
913             })
914         .fewerElementsIf(
915             [=](const LegalityQuery &Query) -> bool {
916               return Query.Types[0].isVector() &&
917                      needToSplitMemOp(Query, Op == G_LOAD);
918             },
919             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
920               const LLT DstTy = Query.Types[0];
921               const LLT PtrTy = Query.Types[1];
922 
923               LLT EltTy = DstTy.getElementType();
924               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
925                                                      Op == G_LOAD);
926 
927               // FIXME: Handle widened to power of 2 results better. This ends
928               // up scalarizing.
929               // FIXME: 3 element stores scalarized on SI
930 
931               // Split if it's too large for the address space.
932               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
933                 unsigned NumElts = DstTy.getNumElements();
934                 unsigned EltSize = EltTy.getSizeInBits();
935 
936                 if (MaxSize % EltSize == 0) {
937                   return std::make_pair(
938                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
939                 }
940 
941                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
942 
943                 // FIXME: Refine when odd breakdowns handled
944                 // The scalars will need to be re-legalized.
945                 if (NumPieces == 1 || NumPieces >= NumElts ||
946                     NumElts % NumPieces != 0)
947                   return std::make_pair(0, EltTy);
948 
949                 return std::make_pair(0,
950                                       LLT::vector(NumElts / NumPieces, EltTy));
951               }
952 
953               // FIXME: We could probably handle weird extending loads better.
954               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
955               if (DstTy.getSizeInBits() > MemSize)
956                 return std::make_pair(0, EltTy);
957 
958               unsigned EltSize = EltTy.getSizeInBits();
959               unsigned DstSize = DstTy.getSizeInBits();
960               if (!isPowerOf2_32(DstSize)) {
961                 // We're probably decomposing an odd sized store. Try to split
962                 // to the widest type. TODO: Account for alignment. As-is it
963                 // should be OK, since the new parts will be further legalized.
964                 unsigned FloorSize = PowerOf2Floor(DstSize);
965                 return std::make_pair(
966                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
967               }
968 
969               // Need to split because of alignment.
970               unsigned Align = Query.MMODescrs[0].AlignInBits;
971               if (EltSize > Align &&
972                   (EltSize / Align < DstTy.getNumElements())) {
973                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
974               }
975 
976               // May need relegalization for the scalars.
977               return std::make_pair(0, EltTy);
978             })
979         .minScalar(0, S32);
980 
981     if (IsStore)
982       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
983 
984     // TODO: Need a bitcast lower option?
985     Actions
986         .legalIf([=](const LegalityQuery &Query) {
987           const LLT Ty0 = Query.Types[0];
988           unsigned Size = Ty0.getSizeInBits();
989           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
990           unsigned Align = Query.MMODescrs[0].AlignInBits;
991 
992           // FIXME: Widening store from alignment not valid.
993           if (MemSize < Size)
994             MemSize = std::max(MemSize, Align);
995 
996           // No extending vector loads.
997           if (Size > MemSize && Ty0.isVector())
998             return false;
999 
1000           switch (MemSize) {
1001           case 8:
1002           case 16:
1003             return Size == 32;
1004           case 32:
1005           case 64:
1006           case 128:
1007             return true;
1008           case 96:
1009             return ST.hasDwordx3LoadStores();
1010           case 256:
1011           case 512:
1012             return true;
1013           default:
1014             return false;
1015           }
1016         })
1017         .widenScalarToNextPow2(0)
1018         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1019   }
1020 
1021   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1022                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1023                                                   {S32, GlobalPtr, 16, 2 * 8},
1024                                                   {S32, LocalPtr, 8, 8},
1025                                                   {S32, LocalPtr, 16, 16},
1026                                                   {S32, PrivatePtr, 8, 8},
1027                                                   {S32, PrivatePtr, 16, 16},
1028                                                   {S32, ConstantPtr, 8, 8},
1029                                                   {S32, ConstantPtr, 16, 2 * 8}});
1030   if (ST.hasFlatAddressSpace()) {
1031     ExtLoads.legalForTypesWithMemDesc(
1032         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1033   }
1034 
1035   ExtLoads.clampScalar(0, S32, S32)
1036           .widenScalarToNextPow2(0)
1037           .unsupportedIfMemSizeNotPow2()
1038           .lower();
1039 
1040   auto &Atomics = getActionDefinitionsBuilder(
1041     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1042      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1043      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1044      G_ATOMICRMW_UMIN})
1045     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1046                {S64, GlobalPtr}, {S64, LocalPtr}});
1047   if (ST.hasFlatAddressSpace()) {
1048     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1049   }
1050 
1051   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1052     .legalFor({{S32, LocalPtr}});
1053 
1054   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1055   // demarshalling
1056   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1057     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1058                 {S32, FlatPtr}, {S64, FlatPtr}})
1059     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1060                {S32, RegionPtr}, {S64, RegionPtr}});
1061   // TODO: Pointer types, any 32-bit or 64-bit vector
1062 
1063   // Condition should be s32 for scalar, s1 for vector.
1064   getActionDefinitionsBuilder(G_SELECT)
1065     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1066           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1067           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1068     .clampScalar(0, S16, S64)
1069     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1070     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1071     .scalarize(1)
1072     .clampMaxNumElements(0, S32, 2)
1073     .clampMaxNumElements(0, LocalPtr, 2)
1074     .clampMaxNumElements(0, PrivatePtr, 2)
1075     .scalarize(0)
1076     .widenScalarToNextPow2(0)
1077     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1078 
1079   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1080   // be more flexible with the shift amount type.
1081   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1082     .legalFor({{S32, S32}, {S64, S32}});
1083   if (ST.has16BitInsts()) {
1084     if (ST.hasVOP3PInsts()) {
1085       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1086             .clampMaxNumElements(0, S16, 2);
1087     } else
1088       Shifts.legalFor({{S16, S32}, {S16, S16}});
1089 
1090     // TODO: Support 16-bit shift amounts
1091     Shifts.clampScalar(1, S32, S32);
1092     Shifts.clampScalar(0, S16, S64);
1093     Shifts.widenScalarToNextPow2(0, 16);
1094   } else {
1095     // Make sure we legalize the shift amount type first, as the general
1096     // expansion for the shifted type will produce much worse code if it hasn't
1097     // been truncated already.
1098     Shifts.clampScalar(1, S32, S32);
1099     Shifts.clampScalar(0, S32, S64);
1100     Shifts.widenScalarToNextPow2(0, 32);
1101   }
1102   Shifts.scalarize(0);
1103 
1104   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1105     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1106     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1107     unsigned IdxTypeIdx = 2;
1108 
1109     getActionDefinitionsBuilder(Op)
1110       .customIf([=](const LegalityQuery &Query) {
1111           const LLT EltTy = Query.Types[EltTypeIdx];
1112           const LLT VecTy = Query.Types[VecTypeIdx];
1113           const LLT IdxTy = Query.Types[IdxTypeIdx];
1114           return (EltTy.getSizeInBits() == 16 ||
1115                   EltTy.getSizeInBits() % 32 == 0) &&
1116                  VecTy.getSizeInBits() % 32 == 0 &&
1117                  VecTy.getSizeInBits() <= 1024 &&
1118                  IdxTy.getSizeInBits() == 32;
1119         })
1120       .clampScalar(EltTypeIdx, S32, S64)
1121       .clampScalar(VecTypeIdx, S32, S64)
1122       .clampScalar(IdxTypeIdx, S32, S32);
1123   }
1124 
1125   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1126     .unsupportedIf([=](const LegalityQuery &Query) {
1127         const LLT &EltTy = Query.Types[1].getElementType();
1128         return Query.Types[0] != EltTy;
1129       });
1130 
1131   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1132     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1133     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1134 
1135     // FIXME: Doesn't handle extract of illegal sizes.
1136     getActionDefinitionsBuilder(Op)
1137       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1138       // FIXME: Multiples of 16 should not be legal.
1139       .legalIf([=](const LegalityQuery &Query) {
1140           const LLT BigTy = Query.Types[BigTyIdx];
1141           const LLT LitTy = Query.Types[LitTyIdx];
1142           return (BigTy.getSizeInBits() % 32 == 0) &&
1143                  (LitTy.getSizeInBits() % 16 == 0);
1144         })
1145       .widenScalarIf(
1146         [=](const LegalityQuery &Query) {
1147           const LLT BigTy = Query.Types[BigTyIdx];
1148           return (BigTy.getScalarSizeInBits() < 16);
1149         },
1150         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1151       .widenScalarIf(
1152         [=](const LegalityQuery &Query) {
1153           const LLT LitTy = Query.Types[LitTyIdx];
1154           return (LitTy.getScalarSizeInBits() < 16);
1155         },
1156         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1157       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1158       .widenScalarToNextPow2(BigTyIdx, 32);
1159 
1160   }
1161 
1162   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1163     .legalForCartesianProduct(AllS32Vectors, {S32})
1164     .legalForCartesianProduct(AllS64Vectors, {S64})
1165     .clampNumElements(0, V16S32, V32S32)
1166     .clampNumElements(0, V2S64, V16S64)
1167     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1168 
1169   if (ST.hasScalarPackInsts()) {
1170     BuildVector
1171       // FIXME: Should probably widen s1 vectors straight to s32
1172       .minScalarOrElt(0, S16)
1173       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1174       .minScalar(1, S32);
1175 
1176     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1177       .legalFor({V2S16, S32})
1178       .lower();
1179     BuildVector.minScalarOrElt(0, S32);
1180   } else {
1181     BuildVector.customFor({V2S16, S16});
1182     BuildVector.minScalarOrElt(0, S32);
1183 
1184     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1185       .customFor({V2S16, S32})
1186       .lower();
1187   }
1188 
1189   BuildVector.legalIf(isRegisterType(0));
1190 
1191   // FIXME: Clamp maximum size
1192   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1193     .legalIf(isRegisterType(0));
1194 
1195   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1196   // pre-legalize.
1197   if (ST.hasVOP3PInsts()) {
1198     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1199       .customFor({V2S16, V2S16})
1200       .lower();
1201   } else
1202     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1203 
1204   // Merge/Unmerge
1205   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1206     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1207     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1208 
1209     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1210       const LLT Ty = Query.Types[TypeIdx];
1211       if (Ty.isVector()) {
1212         const LLT &EltTy = Ty.getElementType();
1213         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1214           return true;
1215         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1216           return true;
1217       }
1218       return false;
1219     };
1220 
1221     auto &Builder = getActionDefinitionsBuilder(Op)
1222       // Try to widen to s16 first for small types.
1223       // TODO: Only do this on targets with legal s16 shifts
1224       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1225 
1226       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1227       .lowerFor({{S16, V2S16}})
1228       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1229       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1230                            elementTypeIs(1, S16)),
1231                        changeTo(1, V2S16))
1232       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1233       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1234       // valid.
1235       .clampScalar(LitTyIdx, S32, S512)
1236       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1237       // Break up vectors with weird elements into scalars
1238       .fewerElementsIf(
1239         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1240         scalarize(0))
1241       .fewerElementsIf(
1242         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1243         scalarize(1))
1244       .clampScalar(BigTyIdx, S32, S1024);
1245 
1246     if (Op == G_MERGE_VALUES) {
1247       Builder.widenScalarIf(
1248         // TODO: Use 16-bit shifts if legal for 8-bit values?
1249         [=](const LegalityQuery &Query) {
1250           const LLT Ty = Query.Types[LitTyIdx];
1251           return Ty.getSizeInBits() < 32;
1252         },
1253         changeTo(LitTyIdx, S32));
1254     }
1255 
1256     Builder.widenScalarIf(
1257       [=](const LegalityQuery &Query) {
1258         const LLT Ty = Query.Types[BigTyIdx];
1259         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1260           Ty.getSizeInBits() % 16 != 0;
1261       },
1262       [=](const LegalityQuery &Query) {
1263         // Pick the next power of 2, or a multiple of 64 over 128.
1264         // Whichever is smaller.
1265         const LLT &Ty = Query.Types[BigTyIdx];
1266         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1267         if (NewSizeInBits >= 256) {
1268           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1269           if (RoundedTo < NewSizeInBits)
1270             NewSizeInBits = RoundedTo;
1271         }
1272         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1273       })
1274       .legalIf([=](const LegalityQuery &Query) {
1275           const LLT &BigTy = Query.Types[BigTyIdx];
1276           const LLT &LitTy = Query.Types[LitTyIdx];
1277 
1278           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1279             return false;
1280           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1281             return false;
1282 
1283           return BigTy.getSizeInBits() % 16 == 0 &&
1284                  LitTy.getSizeInBits() % 16 == 0 &&
1285                  BigTy.getSizeInBits() <= 1024;
1286         })
1287       // Any vectors left are the wrong size. Scalarize them.
1288       .scalarize(0)
1289       .scalarize(1);
1290   }
1291 
1292   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1293   // RegBankSelect.
1294   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1295     .legalFor({{S32}, {S64}});
1296 
1297   if (ST.hasVOP3PInsts()) {
1298     SextInReg.lowerFor({{V2S16}})
1299       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1300       // get more vector shift opportunities, since we'll get those when
1301       // expanded.
1302       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1303   } else if (ST.has16BitInsts()) {
1304     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1305   } else {
1306     // Prefer to promote to s32 before lowering if we don't have 16-bit
1307     // shifts. This avoid a lot of intermediate truncate and extend operations.
1308     SextInReg.lowerFor({{S32}, {S64}});
1309   }
1310 
1311   SextInReg
1312     .scalarize(0)
1313     .clampScalar(0, S32, S64)
1314     .lower();
1315 
1316   getActionDefinitionsBuilder(G_FSHR)
1317     .legalFor({{S32, S32}})
1318     .scalarize(0)
1319     .lower();
1320 
1321   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1322     .legalFor({S64});
1323 
1324   getActionDefinitionsBuilder({
1325       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1326       G_FCOPYSIGN,
1327 
1328       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1329       G_READ_REGISTER,
1330       G_WRITE_REGISTER,
1331 
1332       G_SADDO, G_SSUBO,
1333 
1334        // TODO: Implement
1335       G_FMINIMUM, G_FMAXIMUM,
1336       G_FSHL
1337     }).lower();
1338 
1339   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1340         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1341         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1342     .unsupported();
1343 
1344   computeTables();
1345   verify(*ST.getInstrInfo());
1346 }
1347 
// Entry point for operations marked Custom in the legalization rules built
// by the constructor. Dispatches to the opcode-specific handler; returns
// true when the instruction was legalized (handlers may rewrite or erase
// MI), and false when no handler exists, which the caller treats as a
// legalization failure.
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Signed/unsigned conversion pairs share a handler; the bool selects
  // the signed flavor.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  // G_LOAD is the only handler that needs the change observer (it may
  // mutate the load in place).
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // log2(x) * (1 / log2(e)) == ln(x); log2(x) * (ln(2) / ln(10)) == log10(x).
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    // Opcode was marked custom but has no handler.
    return false;
  }

  // Every case above returns directly.
  llvm_unreachable("expected switch to return");
}
1417 
// Return an s32 register holding the "aperture" (the high 32 bits of the
// 64-bit flat address range) for the LDS or private address space. On
// targets with aperture registers this is read with S_GETREG_B32;
// otherwise it is loaded from the queue pointer. Returns an invalid
// Register on failure.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id, bit offset, and field width into the single
    // S_GETREG_B32 immediate operand.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // S_GETREG_B32 is a target instruction, not a generic opcode, so the
    // result type must be attached manually for later generic users.
    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // The extracted field sits in the low bits; shift it up into position
    // as the high half of the aperture base.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // No aperture registers: read the aperture from the queue descriptor
  // pointed to by the queue pointer input argument.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
1479 
// Expand G_ADDRSPACE_CAST. Handles: no-op casts (rewritten to G_BITCAST),
// casts to/from the 32-bit constant address space, flat -> local/private
// (truncate + null check), and local/private -> flat (widen with the
// segment aperture + null check). Returns false for unsupported
// combinations so the legalizer reports a failure.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    // Same pointer representation in both spaces; just reinterpret in place.
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Widen the 32-bit pointer by merging in the known high bits.
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // The null value differs between address spaces, so a null flat pointer
    // must be mapped to the destination segment's null value explicitly.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  // local/private -> flat: combine the 32-bit segment offset with the
  // segment aperture (the high half of the flat address).
  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
1578 
1579 bool AMDGPULegalizerInfo::legalizeFrint(
1580   MachineInstr &MI, MachineRegisterInfo &MRI,
1581   MachineIRBuilder &B) const {
1582   B.setInstr(MI);
1583 
1584   Register Src = MI.getOperand(1).getReg();
1585   LLT Ty = MRI.getType(Src);
1586   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1587 
1588   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1589   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1590 
1591   auto C1 = B.buildFConstant(Ty, C1Val);
1592   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1593 
1594   // TODO: Should this propagate fast-math-flags?
1595   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1596   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1597 
1598   auto C2 = B.buildFConstant(Ty, C2Val);
1599   auto Fabs = B.buildFAbs(Ty, Src);
1600 
1601   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1602   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1603   return true;
1604 }
1605 
1606 bool AMDGPULegalizerInfo::legalizeFceil(
1607   MachineInstr &MI, MachineRegisterInfo &MRI,
1608   MachineIRBuilder &B) const {
1609   B.setInstr(MI);
1610 
1611   const LLT S1 = LLT::scalar(1);
1612   const LLT S64 = LLT::scalar(64);
1613 
1614   Register Src = MI.getOperand(1).getReg();
1615   assert(MRI.getType(Src) == S64);
1616 
1617   // result = trunc(src)
1618   // if (src > 0.0 && src != result)
1619   //   result += 1.0
1620 
1621   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1622 
1623   const auto Zero = B.buildFConstant(S64, 0.0);
1624   const auto One = B.buildFConstant(S64, 1.0);
1625   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1626   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1627   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1628   auto Add = B.buildSelect(S64, And, One, Zero);
1629 
1630   // TODO: Should this propagate fast-math-flags?
1631   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1632   return true;
1633 }
1634 
1635 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1636                                               MachineIRBuilder &B) {
1637   const unsigned FractBits = 52;
1638   const unsigned ExpBits = 11;
1639   LLT S32 = LLT::scalar(32);
1640 
1641   auto Const0 = B.buildConstant(S32, FractBits - 32);
1642   auto Const1 = B.buildConstant(S32, ExpBits);
1643 
1644   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1645     .addUse(Const0.getReg(0))
1646     .addUse(Const1.getReg(0));
1647 
1648   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1649 }
1650 
1651 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1652   MachineInstr &MI, MachineRegisterInfo &MRI,
1653   MachineIRBuilder &B) const {
1654   B.setInstr(MI);
1655 
1656   const LLT S1 = LLT::scalar(1);
1657   const LLT S32 = LLT::scalar(32);
1658   const LLT S64 = LLT::scalar(64);
1659 
1660   Register Src = MI.getOperand(1).getReg();
1661   assert(MRI.getType(Src) == S64);
1662 
1663   // TODO: Should this use extract since the low half is unused?
1664   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1665   Register Hi = Unmerge.getReg(1);
1666 
1667   // Extract the upper half, since this is where we will find the sign and
1668   // exponent.
1669   auto Exp = extractF64Exponent(Hi, B);
1670 
1671   const unsigned FractBits = 52;
1672 
1673   // Extract the sign bit.
1674   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1675   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1676 
1677   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1678 
1679   const auto Zero32 = B.buildConstant(S32, 0);
1680 
1681   // Extend back to 64-bits.
1682   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1683 
1684   auto Shr = B.buildAShr(S64, FractMask, Exp);
1685   auto Not = B.buildNot(S64, Shr);
1686   auto Tmp0 = B.buildAnd(S64, Src, Not);
1687   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1688 
1689   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1690   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1691 
1692   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1693   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1694   return true;
1695 }
1696 
1697 bool AMDGPULegalizerInfo::legalizeITOFP(
1698   MachineInstr &MI, MachineRegisterInfo &MRI,
1699   MachineIRBuilder &B, bool Signed) const {
1700   B.setInstr(MI);
1701 
1702   Register Dst = MI.getOperand(0).getReg();
1703   Register Src = MI.getOperand(1).getReg();
1704 
1705   const LLT S64 = LLT::scalar(64);
1706   const LLT S32 = LLT::scalar(32);
1707 
1708   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1709 
1710   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1711 
1712   auto CvtHi = Signed ?
1713     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1714     B.buildUITOFP(S64, Unmerge.getReg(1));
1715 
1716   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1717 
1718   auto ThirtyTwo = B.buildConstant(S32, 32);
1719   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1720     .addUse(CvtHi.getReg(0))
1721     .addUse(ThirtyTwo.getReg(0));
1722 
1723   // TODO: Should this propagate fast-math-flags?
1724   B.buildFAdd(Dst, LdExp, CvtLo);
1725   MI.eraseFromParent();
1726   return true;
1727 }
1728 
1729 // TODO: Copied from DAG implementation. Verify logic and document how this
1730 // actually works.
1731 bool AMDGPULegalizerInfo::legalizeFPTOI(
1732   MachineInstr &MI, MachineRegisterInfo &MRI,
1733   MachineIRBuilder &B, bool Signed) const {
1734   B.setInstr(MI);
1735 
1736   Register Dst = MI.getOperand(0).getReg();
1737   Register Src = MI.getOperand(1).getReg();
1738 
1739   const LLT S64 = LLT::scalar(64);
1740   const LLT S32 = LLT::scalar(32);
1741 
1742   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1743 
1744   unsigned Flags = MI.getFlags();
1745 
1746   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1747   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1748   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1749 
1750   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1751   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1752   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1753 
1754   auto Hi = Signed ?
1755     B.buildFPTOSI(S32, FloorMul) :
1756     B.buildFPTOUI(S32, FloorMul);
1757   auto Lo = B.buildFPTOUI(S32, Fma);
1758 
1759   B.buildMerge(Dst, { Lo, Hi });
1760   MI.eraseFromParent();
1761 
1762   return true;
1763 }
1764 
1765 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1766   MachineInstr &MI, MachineRegisterInfo &MRI,
1767   MachineIRBuilder &B) const {
1768   MachineFunction &MF = B.getMF();
1769   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1770 
1771   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1772                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1773 
1774   // With ieee_mode disabled, the instructions have the correct behavior
1775   // already for G_FMINNUM/G_FMAXNUM
1776   if (!MFI->getMode().IEEE)
1777     return !IsIEEEOp;
1778 
1779   if (IsIEEEOp)
1780     return true;
1781 
1782   MachineIRBuilder HelperBuilder(MI);
1783   GISelObserverWrapper DummyObserver;
1784   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1785   HelperBuilder.setInstr(MI);
1786   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1787 }
1788 
1789 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1790   MachineInstr &MI, MachineRegisterInfo &MRI,
1791   MachineIRBuilder &B) const {
1792   // TODO: Should move some of this into LegalizerHelper.
1793 
1794   // TODO: Promote dynamic indexing of s16 to s32
1795 
1796   // FIXME: Artifact combiner probably should have replaced the truncated
1797   // constant before this, so we shouldn't need
1798   // getConstantVRegValWithLookThrough.
1799   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1800     MI.getOperand(2).getReg(), MRI);
1801   if (!IdxVal) // Dynamic case will be selected to register indexing.
1802     return true;
1803 
1804   Register Dst = MI.getOperand(0).getReg();
1805   Register Vec = MI.getOperand(1).getReg();
1806 
1807   LLT VecTy = MRI.getType(Vec);
1808   LLT EltTy = VecTy.getElementType();
1809   assert(EltTy == MRI.getType(Dst));
1810 
1811   B.setInstr(MI);
1812 
1813   if (IdxVal->Value < VecTy.getNumElements())
1814     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1815   else
1816     B.buildUndef(Dst);
1817 
1818   MI.eraseFromParent();
1819   return true;
1820 }
1821 
1822 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1823   MachineInstr &MI, MachineRegisterInfo &MRI,
1824   MachineIRBuilder &B) const {
1825   // TODO: Should move some of this into LegalizerHelper.
1826 
1827   // TODO: Promote dynamic indexing of s16 to s32
1828 
1829   // FIXME: Artifact combiner probably should have replaced the truncated
1830   // constant before this, so we shouldn't need
1831   // getConstantVRegValWithLookThrough.
1832   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1833     MI.getOperand(3).getReg(), MRI);
1834   if (!IdxVal) // Dynamic case will be selected to register indexing.
1835     return true;
1836 
1837   Register Dst = MI.getOperand(0).getReg();
1838   Register Vec = MI.getOperand(1).getReg();
1839   Register Ins = MI.getOperand(2).getReg();
1840 
1841   LLT VecTy = MRI.getType(Vec);
1842   LLT EltTy = VecTy.getElementType();
1843   assert(EltTy == MRI.getType(Ins));
1844 
1845   B.setInstr(MI);
1846 
1847   if (IdxVal->Value < VecTy.getNumElements())
1848     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1849   else
1850     B.buildUndef(Dst);
1851 
1852   MI.eraseFromParent();
1853   return true;
1854 }
1855 
1856 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1857   MachineInstr &MI, MachineRegisterInfo &MRI,
1858   MachineIRBuilder &B) const {
1859   const LLT V2S16 = LLT::vector(2, 16);
1860 
1861   Register Dst = MI.getOperand(0).getReg();
1862   Register Src0 = MI.getOperand(1).getReg();
1863   LLT DstTy = MRI.getType(Dst);
1864   LLT SrcTy = MRI.getType(Src0);
1865 
1866   if (SrcTy == V2S16 && DstTy == V2S16 &&
1867       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1868     return true;
1869 
1870   MachineIRBuilder HelperBuilder(MI);
1871   GISelObserverWrapper DummyObserver;
1872   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1873   HelperBuilder.setInstr(MI);
1874   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1875 }
1876 
1877 bool AMDGPULegalizerInfo::legalizeSinCos(
1878   MachineInstr &MI, MachineRegisterInfo &MRI,
1879   MachineIRBuilder &B) const {
1880   B.setInstr(MI);
1881 
1882   Register DstReg = MI.getOperand(0).getReg();
1883   Register SrcReg = MI.getOperand(1).getReg();
1884   LLT Ty = MRI.getType(DstReg);
1885   unsigned Flags = MI.getFlags();
1886 
1887   Register TrigVal;
1888   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1889   if (ST.hasTrigReducedRange()) {
1890     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1891     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1892       .addUse(MulVal.getReg(0))
1893       .setMIFlags(Flags).getReg(0);
1894   } else
1895     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1896 
1897   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1898     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1899   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1900     .addUse(TrigVal)
1901     .setMIFlags(Flags);
1902   MI.eraseFromParent();
1903   return true;
1904 }
1905 
// Build a pc-relative computation of \p GV + \p Offset into \p DstReg, using
// the relocation kind selected by \p GAFlags. Returns true on success.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // The pc-relative add always produces a 64-bit result; for a 32-bit
  // destination pointer compute into a scratch 64-bit register and extract
  // the low half below.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  // The "+ 4" compensates for $symbol being encoded 4 bytes after the
  // address s_getpc_b64 returns (see the explanation above).
  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0); // Second operand is a placeholder 0 in the no-flags case.
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); // The @hi counterpart.

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  // For 32-bit pointers, take the low 32 bits of the computed address.
  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
 }
1961 
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  // LDS/region globals are materialized as a constant offset assigned by the
  // compiler rather than resolved through a relocation.
  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        // Mark the operand for an absolute low-32-bit relocation instead of
        // replacing the instruction.
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      // Replace the global with the compiler-assigned LDS offset.
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    // An LDS global with an initializer is not supported: diagnose but leave
    // the instruction so compilation can continue.
    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Direct pc-relative fixup (no explicit relocation flags).
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  // Direct pc-relative relocation to the symbol itself.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise go through the GOT: compute the GOT slot address pc-relatively,
  // then load the actual address of the global from it.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2048 
2049 bool AMDGPULegalizerInfo::legalizeLoad(
2050   MachineInstr &MI, MachineRegisterInfo &MRI,
2051   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2052   B.setInstr(MI);
2053   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2054   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2055   Observer.changingInstr(MI);
2056   MI.getOperand(1).setReg(Cast.getReg(0));
2057   Observer.changedInstr(MI);
2058   return true;
2059 }
2060 
2061 bool AMDGPULegalizerInfo::legalizeFMad(
2062   MachineInstr &MI, MachineRegisterInfo &MRI,
2063   MachineIRBuilder &B) const {
2064   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2065   assert(Ty.isScalar());
2066 
2067   MachineFunction &MF = B.getMF();
2068   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2069 
2070   // TODO: Always legal with future ftz flag.
2071   // FIXME: Do we need just output?
2072   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2073     return true;
2074   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2075     return true;
2076 
2077   MachineIRBuilder HelperBuilder(MI);
2078   GISelObserverWrapper DummyObserver;
2079   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2080   HelperBuilder.setMBB(*MI.getParent());
2081   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2082 }
2083 
2084 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2085   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2086   Register DstReg = MI.getOperand(0).getReg();
2087   Register PtrReg = MI.getOperand(1).getReg();
2088   Register CmpVal = MI.getOperand(2).getReg();
2089   Register NewVal = MI.getOperand(3).getReg();
2090 
2091   assert(SITargetLowering::isFlatGlobalAddrSpace(
2092            MRI.getType(PtrReg).getAddressSpace()) &&
2093          "this should not have been custom lowered");
2094 
2095   LLT ValTy = MRI.getType(CmpVal);
2096   LLT VecTy = LLT::vector(2, ValTy);
2097 
2098   B.setInstr(MI);
2099   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2100 
2101   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2102     .addDef(DstReg)
2103     .addUse(PtrReg)
2104     .addUse(PackedVal)
2105     .setMemRefs(MI.memoperands());
2106 
2107   MI.eraseFromParent();
2108   return true;
2109 }
2110 
2111 bool AMDGPULegalizerInfo::legalizeFlog(
2112   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2113   Register Dst = MI.getOperand(0).getReg();
2114   Register Src = MI.getOperand(1).getReg();
2115   LLT Ty = B.getMRI()->getType(Dst);
2116   unsigned Flags = MI.getFlags();
2117   B.setInstr(MI);
2118 
2119   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2120   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2121 
2122   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2123   MI.eraseFromParent();
2124   return true;
2125 }
2126 
2127 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2128                                        MachineIRBuilder &B) const {
2129   Register Dst = MI.getOperand(0).getReg();
2130   Register Src = MI.getOperand(1).getReg();
2131   unsigned Flags = MI.getFlags();
2132   LLT Ty = B.getMRI()->getType(Dst);
2133   B.setInstr(MI);
2134 
2135   auto K = B.buildFConstant(Ty, numbers::log2e);
2136   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2137   B.buildFExp2(Dst, Mul, Flags);
2138   MI.eraseFromParent();
2139   return true;
2140 }
2141 
2142 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2143                                        MachineIRBuilder &B) const {
2144   Register Dst = MI.getOperand(0).getReg();
2145   Register Src0 = MI.getOperand(1).getReg();
2146   Register Src1 = MI.getOperand(2).getReg();
2147   unsigned Flags = MI.getFlags();
2148   LLT Ty = B.getMRI()->getType(Dst);
2149   B.setInstr(MI);
2150   const LLT S16 = LLT::scalar(16);
2151   const LLT S32 = LLT::scalar(32);
2152 
2153   if (Ty == S32) {
2154     auto Log = B.buildFLog2(S32, Src0, Flags);
2155     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2156       .addUse(Log.getReg(0))
2157       .addUse(Src1)
2158       .setMIFlags(Flags);
2159     B.buildFExp2(Dst, Mul, Flags);
2160   } else if (Ty == S16) {
2161     // There's no f16 fmul_legacy, so we need to convert for it.
2162     auto Log = B.buildFLog2(S16, Src0, Flags);
2163     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2164     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2165     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2166       .addUse(Ext0.getReg(0))
2167       .addUse(Ext1.getReg(0))
2168       .setMIFlags(Flags);
2169 
2170     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2171   } else
2172     return false;
2173 
2174   MI.eraseFromParent();
2175   return true;
2176 }
2177 
2178 // Find a source register, ignoring any possible source modifiers.
2179 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2180   Register ModSrc = OrigSrc;
2181   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2182     ModSrc = SrcFNeg->getOperand(1).getReg();
2183     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2184       ModSrc = SrcFAbs->getOperand(1).getReg();
2185   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2186     ModSrc = SrcFAbs->getOperand(1).getReg();
2187   return ModSrc;
2188 }
2189 
2190 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2191                                          MachineRegisterInfo &MRI,
2192                                          MachineIRBuilder &B) const {
2193   B.setInstr(MI);
2194 
2195   const LLT S1 = LLT::scalar(1);
2196   const LLT S64 = LLT::scalar(64);
2197   Register Dst = MI.getOperand(0).getReg();
2198   Register OrigSrc = MI.getOperand(1).getReg();
2199   unsigned Flags = MI.getFlags();
2200   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2201          "this should not have been custom lowered");
2202 
2203   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2204   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2205   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2206   // V_FRACT bug is:
2207   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2208   //
2209   // Convert floor(x) to (x - fract(x))
2210 
2211   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2212     .addUse(OrigSrc)
2213     .setMIFlags(Flags);
2214 
2215   // Give source modifier matching some assistance before obscuring a foldable
2216   // pattern.
2217 
2218   // TODO: We can avoid the neg on the fract? The input sign to fract
2219   // shouldn't matter?
2220   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2221 
2222   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2223 
2224   Register Min = MRI.createGenericVirtualRegister(S64);
2225 
2226   // We don't need to concern ourselves with the snan handling difference, so
2227   // use the one which will directly select.
2228   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2229   if (MFI->getMode().IEEE)
2230     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2231   else
2232     B.buildFMinNum(Min, Fract, Const, Flags);
2233 
2234   Register CorrectedFract = Min;
2235   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2236     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2237     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2238   }
2239 
2240   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2241   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2242 
2243   MI.eraseFromParent();
2244   return true;
2245 }
2246 
2247 // Turn an illegal packed v2s16 build vector into bit operations.
2248 // TODO: This should probably be a bitcast action in LegalizerHelper.
2249 bool AMDGPULegalizerInfo::legalizeBuildVector(
2250   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2251   Register Dst = MI.getOperand(0).getReg();
2252   const LLT S32 = LLT::scalar(32);
2253   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2254 
2255   Register Src0 = MI.getOperand(1).getReg();
2256   Register Src1 = MI.getOperand(2).getReg();
2257   assert(MRI.getType(Src0) == LLT::scalar(16));
2258 
2259   B.setInstr(MI);
2260   auto Merge = B.buildMerge(S32, {Src0, Src1});
2261   B.buildBitcast(Dst, Merge);
2262 
2263   MI.eraseFromParent();
2264   return true;
2265 }
2266 
2267 // Return the use branch instruction, otherwise null if the usage is invalid.
2268 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2269                                        MachineRegisterInfo &MRI,
2270                                        MachineInstr *&Br) {
2271   Register CondDef = MI.getOperand(0).getReg();
2272   if (!MRI.hasOneNonDBGUse(CondDef))
2273     return nullptr;
2274 
2275   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2276   if (UseMI.getParent() != MI.getParent() ||
2277       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2278     return nullptr;
2279 
2280   // Make sure the cond br is followed by a G_BR
2281   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2282   if (Next != MI.getParent()->end()) {
2283     if (Next->getOpcode() != AMDGPU::G_BR)
2284       return nullptr;
2285     Br = &*Next;
2286   }
2287 
2288   return &UseMI;
2289 }
2290 
2291 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2292                                                MachineRegisterInfo &MRI,
2293                                                Register LiveIn,
2294                                                Register PhyReg) const {
2295   assert(PhyReg.isPhysical() && "Physical register expected");
2296 
2297   // Insert the live-in copy, if required, by defining destination virtual
2298   // register.
2299   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2300   if (!MRI.getVRegDef(LiveIn)) {
2301     // FIXME: Should have scoped insert pt
2302     MachineBasicBlock &OrigInsBB = B.getMBB();
2303     auto OrigInsPt = B.getInsertPt();
2304 
2305     MachineBasicBlock &EntryMBB = B.getMF().front();
2306     EntryMBB.addLiveIn(PhyReg);
2307     B.setInsertPt(EntryMBB, EntryMBB.begin());
2308     B.buildCopy(LiveIn, PhyReg);
2309 
2310     B.setInsertPt(OrigInsBB, OrigInsPt);
2311   }
2312 
2313   return LiveIn;
2314 }
2315 
2316 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2317                                                 MachineRegisterInfo &MRI,
2318                                                 Register PhyReg, LLT Ty,
2319                                                 bool InsertLiveInCopy) const {
2320   assert(PhyReg.isPhysical() && "Physical register expected");
2321 
2322   // Get or create virtual live-in regester
2323   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2324   if (!LiveIn) {
2325     LiveIn = MRI.createGenericVirtualRegister(Ty);
2326     MRI.addLiveIn(PhyReg, LiveIn);
2327   }
2328 
2329   // When the actual true copy required is from virtual register to physical
2330   // register (to be inserted later), live-in copy insertion from physical
2331   // to register virtual register is not required
2332   if (!InsertLiveInCopy)
2333     return LiveIn;
2334 
2335   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2336 }
2337 
2338 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2339     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2340   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2341   const ArgDescriptor *Arg;
2342   const TargetRegisterClass *RC;
2343   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2344   if (!Arg) {
2345     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2346     return nullptr;
2347   }
2348   return Arg;
2349 }
2350 
2351 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2352                                          const ArgDescriptor *Arg) const {
2353   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2354     return false; // TODO: Handle these
2355 
2356   Register SrcReg = Arg->getRegister();
2357   assert(SrcReg.isPhysical() && "Physical register expected");
2358   assert(DstReg.isVirtual() && "Virtual register expected");
2359 
2360   MachineRegisterInfo &MRI = *B.getMRI();
2361 
2362   LLT Ty = MRI.getType(DstReg);
2363   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2364 
2365   if (Arg->isMasked()) {
2366     // TODO: Should we try to emit this once in the entry block?
2367     const LLT S32 = LLT::scalar(32);
2368     const unsigned Mask = Arg->getMask();
2369     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2370 
2371     Register AndMaskSrc = LiveIn;
2372 
2373     if (Shift != 0) {
2374       auto ShiftAmt = B.buildConstant(S32, Shift);
2375       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2376     }
2377 
2378     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2379   } else {
2380     B.buildCopy(DstReg, LiveIn);
2381   }
2382 
2383   return true;
2384 }
2385 
2386 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2387     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2388     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2389   B.setInstr(MI);
2390 
2391   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2392   if (!Arg)
2393     return false;
2394 
2395   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2396     return false;
2397 
2398   MI.eraseFromParent();
2399   return true;
2400 }
2401 
2402 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2403                                        MachineRegisterInfo &MRI,
2404                                        MachineIRBuilder &B) const {
2405   B.setInstr(MI);
2406   Register Dst = MI.getOperand(0).getReg();
2407   LLT DstTy = MRI.getType(Dst);
2408   LLT S16 = LLT::scalar(16);
2409   LLT S32 = LLT::scalar(32);
2410   LLT S64 = LLT::scalar(64);
2411 
2412   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2413     return true;
2414 
2415   if (DstTy == S16)
2416     return legalizeFDIV16(MI, MRI, B);
2417   if (DstTy == S32)
2418     return legalizeFDIV32(MI, MRI, B);
2419   if (DstTy == S64)
2420     return legalizeFDIV64(MI, MRI, B);
2421 
2422   return false;
2423 }
2424 
2425 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2426   const LLT S32 = LLT::scalar(32);
2427 
2428   auto Cvt0 = B.buildUITOFP(S32, Src);
2429   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2430   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2431   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2432   return B.buildFPTOUI(S32, Mul).getReg(0);
2433 }
2434 
// Expand 32-bit unsigned division / remainder in terms of the hardware's
// approximate f32 reciprocal (see buildDivRCP) followed by explicit error
// correction of the quotient. The quotient (or remainder, when IsRem is
// set) is written to DstReg.
void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Num,
                                                  Register Den,
                                                  bool IsRem) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is the rounding error.
  auto RCP = buildDivRCP(B, Den);

  // RCP_LO = mul(RCP, Den)
  auto RCP_LO = B.buildMul(S32, RCP, Den);

  // RCP_HI = mulhu(RCP, Den)
  auto RCP_HI = B.buildUMulH(S32, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  auto Zero = B.buildConstant(S32, 0);
  auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
  auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  auto RCP_A_E = B.buildAdd(S32, RCP, E);

  // RCP_S_E = RCP - E
  auto RCP_S_E = B.buildSub(S32, RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  auto Quotient = B.buildUMulH(S32, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);

  // Remainder_GE_Den = Remainder >= Den
  auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);

  // Remainder_GE_Zero = Num >= Num_S_Remainder, i.e. the subtraction above
  // did not wrap around.
  auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
                                       Num, Num_S_Remainder);

  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);

  // Calculate Division result:

  // Quotient_A_One = Quotient + 1
  auto One = B.buildConstant(S32, 1);
  auto Quotient_A_One = B.buildAdd(S32, Quotient, One);

  // Quotient_S_One = Quotient - 1
  auto Quotient_S_One = B.buildSub(S32, Quotient, One);

  // Div = (Tmp1 ? Quotient : Quotient_A_One)
  auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);

  // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
  if (IsRem) {
    // NOTE(review): this select has no use in the remainder path and looks
    // dead — presumably relies on later DCE; confirm.
    Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);

    // Calculate Rem result:

    // Remainder_S_Den = Remainder - Den
    auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);

    // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
    auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);

    // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
    B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
  } else {
    B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
  }
}
2524 
2525 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2526                                               MachineRegisterInfo &MRI,
2527                                               MachineIRBuilder &B) const {
2528   B.setInstr(MI);
2529   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2530   Register DstReg = MI.getOperand(0).getReg();
2531   Register Num = MI.getOperand(1).getReg();
2532   Register Den = MI.getOperand(2).getReg();
2533   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2534   MI.eraseFromParent();
2535   return true;
2536 }
2537 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // Combine halves into one f32 approximation: hi * 2**32 + lo.
  // 0x4f800000 is the f32 bit pattern of 2**32.
  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // 0x5f7ffffc is just below 2**64 in f32, scaling the reciprocal up into
  // 64-bit integer range.
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
2580 
// Expand 64-bit unsigned division / remainder. The divisor's reciprocal is
// approximated with f32 math (emitReciprocalU64), refined with 64-bit
// integer arithmetic, and the resulting quotient / remainder is corrected
// with up to two conditional adjustment steps.
bool AMDGPULegalizerInfo::legalizeUDIV_UREM64(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  B.setInstr(MI);

  const bool IsDiv = MI.getOpcode() == TargetOpcode::G_UDIV;
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register Numer = MI.getOperand(1).getReg();
  Register Denom = MI.getOperand(2).getReg();
  Register RcpLo, RcpHi;

  // Initial reciprocal estimate of the denominator, as two 32-bit halves.
  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement step of the reciprocal estimate.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  // Add the correction with an explicit 32-bit carry chain; Add1_HiNc is
  // the high half computed without carry-in, used by the next step.
  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement step.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Candidate quotient and the corresponding remainder Numer - Q * Denom.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 != 0 iff the remainder is still >= Denom (64-bit unsigned compare
  // built from 32-bit halves).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  // First correction: remainder -= Denom, quotient += 1.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 != 0 iff a second correction is needed (remainder still >= Denom).
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  // Second correction: remainder -= Denom, quotient += 1 again.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Pick the corrected quotient (div) or remainder (rem) based on how many
  // correction steps were actually needed.
  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(MI.getOperand(0),
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }

  MI.eraseFromParent();
  return true;
}
2699 
2700 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2701                                             MachineRegisterInfo &MRI,
2702                                             MachineIRBuilder &B) const {
2703   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2704   if (Ty == LLT::scalar(32))
2705     return legalizeUDIV_UREM32(MI, MRI, B);
2706   if (Ty == LLT::scalar(64))
2707     return legalizeUDIV_UREM64(MI, MRI, B);
2708   return false;
2709 }
2710 
2711 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2712                                               MachineRegisterInfo &MRI,
2713                                               MachineIRBuilder &B) const {
2714   B.setInstr(MI);
2715   const LLT S32 = LLT::scalar(32);
2716 
2717   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2718   Register DstReg = MI.getOperand(0).getReg();
2719   Register LHS = MI.getOperand(1).getReg();
2720   Register RHS = MI.getOperand(2).getReg();
2721 
2722   auto ThirtyOne = B.buildConstant(S32, 31);
2723   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2724   auto RHSign = B.buildAShr(S32, LHS, ThirtyOne);
2725 
2726   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2727   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2728 
2729   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2730   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2731 
2732   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2733   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2734 
2735   if (IsRem) {
2736     auto RSign = LHSign; // Remainder sign is the same as LHS
2737     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2738     B.buildSub(DstReg, UDivRem, RSign);
2739   } else {
2740     auto DSign = B.buildXor(S32, LHSign, RHSign);
2741     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2742     B.buildSub(DstReg, UDivRem, DSign);
2743   }
2744 
2745   MI.eraseFromParent();
2746   return true;
2747 }
2748 
2749 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2750                                             MachineRegisterInfo &MRI,
2751                                             MachineIRBuilder &B) const {
2752   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2753     return legalizeSDIV_SREM32(MI, MRI, B);
2754   return false;
2755 }
2756 
2757 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2758                                                  MachineRegisterInfo &MRI,
2759                                                  MachineIRBuilder &B) const {
2760   Register Res = MI.getOperand(0).getReg();
2761   Register LHS = MI.getOperand(1).getReg();
2762   Register RHS = MI.getOperand(2).getReg();
2763 
2764   uint16_t Flags = MI.getFlags();
2765 
2766   LLT ResTy = MRI.getType(Res);
2767   LLT S32 = LLT::scalar(32);
2768   LLT S64 = LLT::scalar(64);
2769 
2770   const MachineFunction &MF = B.getMF();
2771   bool Unsafe =
2772     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2773 
2774   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2775     return false;
2776 
2777   if (!Unsafe && ResTy == S32 &&
2778       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2779     return false;
2780 
2781   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2782     // 1 / x -> RCP(x)
2783     if (CLHS->isExactlyValue(1.0)) {
2784       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2785         .addUse(RHS)
2786         .setMIFlags(Flags);
2787 
2788       MI.eraseFromParent();
2789       return true;
2790     }
2791 
2792     // -1 / x -> RCP( FNEG(x) )
2793     if (CLHS->isExactlyValue(-1.0)) {
2794       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2795       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2796         .addUse(FNeg.getReg(0))
2797         .setMIFlags(Flags);
2798 
2799       MI.eraseFromParent();
2800       return true;
2801     }
2802   }
2803 
2804   // x / y -> x * (1.0 / y)
2805   if (Unsafe) {
2806     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2807       .addUse(RHS)
2808       .setMIFlags(Flags);
2809     B.buildFMul(Res, LHS, RCP, Flags);
2810 
2811     MI.eraseFromParent();
2812     return true;
2813   }
2814 
2815   return false;
2816 }
2817 
2818 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2819                                          MachineRegisterInfo &MRI,
2820                                          MachineIRBuilder &B) const {
2821   B.setInstr(MI);
2822   Register Res = MI.getOperand(0).getReg();
2823   Register LHS = MI.getOperand(1).getReg();
2824   Register RHS = MI.getOperand(2).getReg();
2825 
2826   uint16_t Flags = MI.getFlags();
2827 
2828   LLT S16 = LLT::scalar(16);
2829   LLT S32 = LLT::scalar(32);
2830 
2831   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2832   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2833 
2834   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2835     .addUse(RHSExt.getReg(0))
2836     .setMIFlags(Flags);
2837 
2838   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2839   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2840 
2841   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2842     .addUse(RDst.getReg(0))
2843     .addUse(RHS)
2844     .addUse(LHS)
2845     .setMIFlags(Flags);
2846 
2847   MI.eraseFromParent();
2848   return true;
2849 }
2850 
2851 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2852 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2853 static void toggleSPDenormMode(bool Enable,
2854                                MachineIRBuilder &B,
2855                                const GCNSubtarget &ST,
2856                                AMDGPU::SIModeRegisterDefaults Mode) {
2857   // Set SP denorm mode to this value.
2858   unsigned SPDenormMode =
2859     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2860 
2861   if (ST.hasDenormModeInst()) {
2862     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2863     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2864 
2865     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2866     B.buildInstr(AMDGPU::S_DENORM_MODE)
2867       .addImm(NewDenormModeValue);
2868 
2869   } else {
2870     // Select FP32 bit field in mode register.
2871     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2872                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2873                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2874 
2875     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2876       .addImm(SPDenormMode)
2877       .addImm(SPDenormModeBitField);
2878   }
2879 }
2880 
// Lower f32 fdiv using the div_scale / div_fmas / div_fixup sequence, with
// an FMA-based refinement loop in between. If the function does not run
// with FP32 denormals enabled, denormal support is temporarily switched on
// around the refinement so intermediate results are not flushed.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // Scale the denominator (last imm operand 1) and numerator (0) into a
  // range where the refinement is accurate; the S1 result flags whether
  // scaling occurred.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(LHS)
      .addImm(1)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // FMA-based refinement of the reciprocal and quotient.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  // div_fmas merges the final residual using the scale flag produced by
  // div_scale on the numerator.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div_fixup handles special values (infinities, zero denominators, etc.).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
2947 
2948 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2949                                          MachineRegisterInfo &MRI,
2950                                          MachineIRBuilder &B) const {
2951   B.setInstr(MI);
2952   Register Res = MI.getOperand(0).getReg();
2953   Register LHS = MI.getOperand(1).getReg();
2954   Register RHS = MI.getOperand(2).getReg();
2955 
2956   uint16_t Flags = MI.getFlags();
2957 
2958   LLT S64 = LLT::scalar(64);
2959   LLT S1 = LLT::scalar(1);
2960 
2961   auto One = B.buildFConstant(S64, 1.0);
2962 
2963   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2964     .addUse(LHS)
2965     .addUse(RHS)
2966     .addImm(1)
2967     .setMIFlags(Flags);
2968 
2969   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2970 
2971   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2972     .addUse(DivScale0.getReg(0))
2973     .setMIFlags(Flags);
2974 
2975   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2976   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2977   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2978 
2979   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2980     .addUse(LHS)
2981     .addUse(RHS)
2982     .addImm(0)
2983     .setMIFlags(Flags);
2984 
2985   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2986   auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2987   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2988 
2989   Register Scale;
2990   if (!ST.hasUsableDivScaleConditionOutput()) {
2991     // Workaround a hardware bug on SI where the condition output from div_scale
2992     // is not usable.
2993 
2994     LLT S32 = LLT::scalar(32);
2995 
2996     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2997     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2998     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2999     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3000 
3001     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3002                               Scale1Unmerge.getReg(1));
3003     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3004                               Scale0Unmerge.getReg(1));
3005     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3006   } else {
3007     Scale = DivScale1.getReg(1);
3008   }
3009 
3010   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3011     .addUse(Fma4.getReg(0))
3012     .addUse(Fma3.getReg(0))
3013     .addUse(Mul.getReg(0))
3014     .addUse(Scale)
3015     .setMIFlags(Flags);
3016 
3017   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3018     .addUse(Fmas.getReg(0))
3019     .addUse(RHS)
3020     .addUse(LHS)
3021     .setMIFlags(Flags);
3022 
3023   MI.eraseFromParent();
3024   return true;
3025 }
3026 
3027 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3028                                                  MachineRegisterInfo &MRI,
3029                                                  MachineIRBuilder &B) const {
3030   B.setInstr(MI);
3031   Register Res = MI.getOperand(0).getReg();
3032   Register LHS = MI.getOperand(2).getReg();
3033   Register RHS = MI.getOperand(3).getReg();
3034   uint16_t Flags = MI.getFlags();
3035 
3036   LLT S32 = LLT::scalar(32);
3037   LLT S1 = LLT::scalar(1);
3038 
3039   auto Abs = B.buildFAbs(S32, RHS, Flags);
3040   const APFloat C0Val(1.0f);
3041 
3042   auto C0 = B.buildConstant(S32, 0x6f800000);
3043   auto C1 = B.buildConstant(S32, 0x2f800000);
3044   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3045 
3046   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3047   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3048 
3049   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3050 
3051   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3052     .addUse(Mul0.getReg(0))
3053     .setMIFlags(Flags);
3054 
3055   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3056 
3057   B.buildFMul(Res, Sel, Mul1, Flags);
3058 
3059   MI.eraseFromParent();
3060   return true;
3061 }
3062 
3063 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3064                                                  MachineRegisterInfo &MRI,
3065                                                  MachineIRBuilder &B) const {
3066   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3067   if (!MFI->isEntryFunction()) {
3068     return legalizePreloadedArgIntrin(MI, MRI, B,
3069                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3070   }
3071 
3072   B.setInstr(MI);
3073 
3074   uint64_t Offset =
3075     ST.getTargetLowering()->getImplicitParameterOffset(
3076       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3077   Register DstReg = MI.getOperand(0).getReg();
3078   LLT DstTy = MRI.getType(DstReg);
3079   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3080 
3081   const ArgDescriptor *Arg;
3082   const TargetRegisterClass *RC;
3083   std::tie(Arg, RC)
3084     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3085   if (!Arg)
3086     return false;
3087 
3088   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3089   if (!loadInputValue(KernargPtrReg, B, Arg))
3090     return false;
3091 
3092   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3093   MI.eraseFromParent();
3094   return true;
3095 }
3096 
3097 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3098                                               MachineRegisterInfo &MRI,
3099                                               MachineIRBuilder &B,
3100                                               unsigned AddrSpace) const {
3101   B.setInstr(MI);
3102   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3103   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3104   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3105   MI.eraseFromParent();
3106   return true;
3107 }
3108 
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field).  This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
// Returns {voffset register, immoffset, total constant offset folded}.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value representable in the 12-bit immoffset field.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  MachineInstr *OffsetDef;
  const LLT S32 = LLT::scalar(32);

  // Peel any constant component off the offset expression.
  std::tie(BaseReg, TotalConstOffset, OffsetDef)
    = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold whatever did not fit in the immediate field back into the voffset
  // register (creating it if the offset was entirely constant).
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The instruction always needs some voffset register; default to zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}
3157 
3158 /// Handle register layout difference for f16 images for some subtargets.
3159 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3160                                              MachineRegisterInfo &MRI,
3161                                              Register Reg) const {
3162   if (!ST.hasUnpackedD16VMem())
3163     return Reg;
3164 
3165   const LLT S16 = LLT::scalar(16);
3166   const LLT S32 = LLT::scalar(32);
3167   LLT StoreVT = MRI.getType(Reg);
3168   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3169 
3170   auto Unmerge = B.buildUnmerge(S16, Reg);
3171 
3172   SmallVector<Register, 4> WideRegs;
3173   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3174     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3175 
3176   int NumElts = StoreVT.getNumElements();
3177 
3178   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3179 }
3180 
3181 Register AMDGPULegalizerInfo::fixStoreSourceType(
3182   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3183   MachineRegisterInfo *MRI = B.getMRI();
3184   LLT Ty = MRI->getType(VData);
3185 
3186   const LLT S16 = LLT::scalar(16);
3187 
3188   // Fixup illegal register types for i8 stores.
3189   if (Ty == LLT::scalar(8) || Ty == S16) {
3190     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3191     return AnyExt;
3192   }
3193 
3194   if (Ty.isVector()) {
3195     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3196       if (IsFormat)
3197         return handleD16VData(B, *MRI, VData);
3198     }
3199   }
3200 
3201   return VData;
3202 }
3203 
// Lower a raw/struct (t)buffer store intrinsic into the corresponding
// target G_AMDGPU_BUFFER_STORE* instruction, splitting the offset operand
// between the voffset register and the immediate offset field.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  B.setInstr(MI);

  // Intrinsic operand layout: (intrinsic id), vdata, rsrc, [vindex],
  // voffset, soffset, [format], aux.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;
  unsigned TotalOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Fold any constant component of the offset into the immediate field and
  // reflect it in the memory operand.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  // Pick the target opcode from the intrinsic flavor and the store width.
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  // Raw variants still carry a vindex operand; it is simply zero.
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3295 
/// Legalize the raw/struct buffer load intrinsics (plain, format, and typed
/// variants) into target-specific buffer load pseudo instructions.
///
/// Small scalar loads are widened to 32 bits and truncated back; D16 vector
/// results on unpacked-D16 subtargets are loaded into 32-bit elements and
/// repacked to the original 16-bit vector type.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  B.setInstr(MI);

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  // Only the format variants have a D16 (16-bit component) form.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the voffset into the part carried in a register and the part that
  // fits the instruction's immediate offset field; update the MMO if a
  // constant offset was folded out.
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    // Plain loads select the opcode from the access size in bytes.
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword scalar loads and unpacked D16 vector loads need an intermediate
  // wider result register that is narrowed/repacked afterwards.
  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  // Raw variants have no vindex; use a constant 0 so the pseudo has a uniform
  // operand layout (selection distinguishes them via the idxen immediate).
  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // Narrow the widened result of an extending load back to the original
    // type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}
3417 
3418 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3419                                                MachineIRBuilder &B,
3420                                                bool IsInc) const {
3421   B.setInstr(MI);
3422   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3423                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3424   B.buildInstr(Opc)
3425     .addDef(MI.getOperand(0).getReg())
3426     .addUse(MI.getOperand(2).getReg())
3427     .addUse(MI.getOperand(3).getReg())
3428     .cloneMemRefs(MI);
3429   MI.eraseFromParent();
3430   return true;
3431 }
3432 
/// Map a raw/struct buffer atomic intrinsic ID to the corresponding
/// target-specific buffer atomic pseudo opcode. Raw and struct variants of the
/// same operation share a pseudo; they differ only in operand layout, which
/// legalizeBufferAtomic normalizes.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
3478 
/// Legalize the raw/struct buffer atomic intrinsics into target-specific
/// buffer atomic pseudo instructions with a uniform operand layout.
///
/// cmpswap carries an extra comparison-value operand, and the struct variants
/// carry an extra vindex operand; both shift the positions of the remaining
/// operands, tracked via OpOffset.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  // Operand 1 is the intrinsic ID; operand 2 is the data value.
  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the voffset into the register and immediate components the hardware
  // supports; update the MMO if a constant offset was folded out.
  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  // Raw variants have no vindex; use a constant 0 so the pseudo has a uniform
  // operand layout (selection distinguishes them via the idxen immediate).
  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
3543 
/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
///
/// Operands in [\p AddrIdx, \p DimIdx) are bitcast directly; operands from
/// \p DimIdx onward are paired into v2s16 build_vectors. Components that
/// hardware requires to start a new dword (see below) are padded with undef
/// in the high half instead of being paired with the next component.
static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                        SmallVectorImpl<Register> &PackedAddrs,
                                        int AddrIdx, int DimIdx, int NumVAddrs,
                                        int NumGradients) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
    Register AddrReg = MI.getOperand(I).getReg();

    if (I < DimIdx) {
      // Extra (non-coordinate) arguments before the dim operands are already
      // dword sized; just reinterpret them as v2s16.
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
          ((NumGradients / 2) % 2 == 1 &&
           (I == DimIdx + (NumGradients / 2) - 1 ||
            I == DimIdx + NumGradients - 1))) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this component with the next one and consume both.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
3578 
3579 /// Convert from separate vaddr components to a single vector address register,
3580 /// and replace the remaining operands with $noreg.
3581 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3582                                      int DimIdx, int NumVAddrs) {
3583   SmallVector<Register, 8> AddrRegs(NumVAddrs);
3584   for (int I = 0; I != NumVAddrs; ++I) {
3585     AddrRegs[I] = MI.getOperand(DimIdx + I).getReg();
3586     assert(B.getMRI()->getType(AddrRegs[I]) == LLT::scalar(32));
3587   }
3588 
3589   auto VAddr = B.buildBuildVector(LLT::vector(NumVAddrs, 32), AddrRegs);
3590   MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3591   for (int I = 1; I != NumVAddrs; ++I)
3592     MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3593 }
3594 
3595 /// Return number of address arguments, and the number of gradients
3596 static std::pair<int, int>
3597 getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
3598                  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
3599   const AMDGPU::MIMGDimInfo *DimInfo
3600     = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
3601 
3602   int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
3603   int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
3604   int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
3605   int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
3606   return {NumVAddr, NumGradients};
3607 }
3608 
3609 static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
3610                        int NumDefs) {
3611   assert(!BaseOpcode->Atomic);
3612   return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
3613 }
3614 
3615 /// Return first address operand index in an image intrinsic.
3616 static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
3617                                  int NumDefs) {
3618   if (BaseOpcode->Atomic)
3619     return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
3620   return getDMaskIdx(BaseOpcode, NumDefs) + 1;
3621 }
3622 
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  B.setInstr(MI);

  const int NumDefs = MI.getNumExplicitDefs();
  // Two defs means the IR-level struct return of (data, error flag).
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  // Index of first address argument
  const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);

  // Check for 16 bit addresses and pack if true.
  int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
  LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
  const bool IsA16 = AddrTy == S16;

  int NumVAddrs, NumGradients;
  std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
  const int DMaskIdx = BaseOpcode->Atomic ? -1 :
    getDMaskIdx(BaseOpcode, NumDefs);
  unsigned DMask = 0;

  // Atomics have no dmask operand; DMaskLanes stays 0 in that case.
  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(DMaskIdx).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(DMaskIdx).setImm(DMask);
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator
  // that MIMG addresses should be placed contiguously when it is possible to
  // do so, so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  const bool UseNSA = NumVAddrs >= 3 &&
                      ST.hasFeature(AMDGPU::FeatureNSAEncoding);

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16) {
    // FIXME: this feature is missing from gfx10. When that is fixed, this check
    // should be introduced.
    if (!ST.hasR128A16() && !ST.hasGFX10A16())
      return false;

    if (NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
                                  NumGradients);

      // Without NSA the address components must be one contiguous vector.
      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      // Write the packed registers into the leading address operands, and mark
      // the leftover slots with $noreg.
      const int NumPacked = PackedRegs.size();
      for (int I = 0; I != NumVAddrs; ++I) {
        assert(MI.getOperand(AddrIdx + I).getReg() != AMDGPU::NoRegister);

        if (I < NumPacked)
          MI.getOperand(AddrIdx + I).setReg(PackedRegs[I]);
        else
          MI.getOperand(AddrIdx + I).setReg(AMDGPU::NoRegister);
      }
    }
  } else if (!UseNSA && NumVAddrs > 1) {
    convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
  }

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    // Only 16-bit element vectors need a store-data rewrite.
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Register RepackedReg = handleD16VData(B, *MRI, VData);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    // Concat up to <6 x s16> and split off the desired <3 x s16> part.
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
3928 
/// Legalize amdgcn.s.buffer.load by rewriting it in place to the
/// G_AMDGPU_S_BUFFER_LOAD pseudo, attaching a memory operand and widening
/// non-power-of-2 result types.
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    // Round the result up to the next power-of-2 element count / scalar size.
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
3971 
/// Legalize llvm.trap: with the HSA trap handler enabled, pass the queue
/// pointer in SGPR0_SGPR1 and emit S_TRAP; otherwise just end the program.
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  B.setInstr(MI);

  // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
      !ST.isTrapHandlerEnabled()) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  } else {
    // Pass queue pointer to trap handler as input, and insert trap instruction
    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
    const ArgDescriptor *Arg =
        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
    if (!Arg)
      return false;
    MachineRegisterInfo &MRI = *B.getMRI();
    // The trap-handler ABI expects the queue pointer in SGPR0_SGPR1.
    Register SGPR01(AMDGPU::SGPR0_SGPR1);
    Register LiveIn = getLiveInRegister(
        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
        /*InsertLiveInCopy=*/false);
    if (!loadInputValue(LiveIn, B, Arg))
      return false;
    B.buildCopy(SGPR01, LiveIn);
    // SGPR0_SGPR1 is an implicit use so it survives until the trap executes.
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(GCNSubtarget::TrapIDLLVMTrap)
        .addReg(SGPR01, RegState::Implicit);
  }

  MI.eraseFromParent();
  return true;
}
4004 
4005 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4006     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4007   B.setInstr(MI);
4008 
4009   // Is non-HSA path or trap-handler disabled? then, report a warning
4010   // accordingly
4011   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4012       !ST.isTrapHandlerEnabled()) {
4013     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4014                                      "debugtrap handler not supported",
4015                                      MI.getDebugLoc(), DS_Warning);
4016     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4017     Ctx.diagnose(NoTrap);
4018   } else {
4019     // Insert debug-trap instruction
4020     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4021   }
4022 
4023   MI.eraseFromParent();
4024   return true;
4025 }
4026 
/// Top-level dispatch for legalizing target intrinsics. Returns true when the
/// intrinsic was handled (including "nothing to do"); false signals failure so
/// the legalizer can report it.
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    // These structured-control-flow intrinsics must feed a G_BRCOND (possibly
    // followed by an unconditional G_BR). verifyCFIntrinsic locates the
    // G_BRCOND and fills Br with the trailing G_BR, if any.
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      // Build the pseudo at the position of the conditional branch.
      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      // The pseudo branches to the G_BRCOND's target, unless a trailing G_BR
      // exists, in which case the two branch targets are swapped (the pseudo
      // takes the G_BR's target, and the G_BR is retargeted below).
      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      // Complete the target swap described above.
      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      // The pseudos operate on the exec mask, so constrain both registers to
      // the wave-mask register class.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      // Erase the intrinsic and the G_BRCOND it replaced; the pseudo emitted
      // above now carries the control flow.
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    // Same branch-pattern verification as amdgcn_if/else above.
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      // As with SI_IF/SI_ELSE: take the trailing G_BR's target if present,
      // and retarget that G_BR to the G_BRCOND's original destination.
      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      // SI_LOOP consumes an exec-mask value.
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      B.setInstr(MI);
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  // The following intrinsics all read values preloaded into registers (or
  // derivable from them) per the calling convention; lower each to a copy
  // from the corresponding preloaded argument.
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold to an immediate.
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  // Buffer store/load variants differ only in the two boolean flags passed
  // through — presumably typed-format selectors (e.g. IsTyped/IsFormat);
  // confirm against legalizeBufferStore/legalizeBufferLoad's signatures.
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  // All buffer atomics share one legalization path, keyed by the intrinsic ID.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  // Boolean flag distinguishes increment (true) from decrement (false).
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  default: {
    // Image intrinsics are table-driven; anything else is already legal.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
4223