1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
// Round the number of vector elements up to the next power of two.
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
// Round the scalar size in bits up to the next power of two.
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
70   return [=](const LegalityQuery &Query) {
71     return Query.Types[TypeIdx].getSizeInBits() == Size;
72   };
73 }
74 
75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     return Ty.isVector() &&
79            Ty.getNumElements() % 2 != 0 &&
80            Ty.getElementType().getSizeInBits() < 32 &&
81            Ty.getSizeInBits() % 32 != 0;
82   };
83 }
84 
85 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
86   return [=](const LegalityQuery &Query) {
87     const LLT Ty = Query.Types[TypeIdx];
88     const LLT EltTy = Ty.getScalarType();
89     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
90   };
91 }
92 
93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getElementType();
97     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
98   };
99 }
100 
101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     unsigned Size = Ty.getSizeInBits();
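    // Break the vector into roughly 64-bit pieces: compute the required piece
    // count, then distribute the element count across the pieces.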
106     unsigned Pieces = (Size + 63) / 64;
107     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
108     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
109   };
110 }
111 
// Increase the number of vector elements until the total size is a multiple
// of 32 bits.
114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
115   return [=](const LegalityQuery &Query) {
116     const LLT Ty = Query.Types[TypeIdx];
117 
118     const LLT EltTy = Ty.getElementType();
119     const int Size = Ty.getSizeInBits();
120     const int EltSize = EltTy.getSizeInBits();
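    // Number of 32-bit words needed to cover the current size, i.e. the total
    // size rounded up to a multiple of 32 bits, in units of 32 bits.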
121     const int NextMul32 = (Size + 31) / 32;
122 
123     assert(EltSize < 32);
124 
125     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
126     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
127   };
128 }
129 
130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
131   return [=](const LegalityQuery &Query) {
132     const LLT QueryTy = Query.Types[TypeIdx];
133     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
134   };
135 }
136 
137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
138   return [=](const LegalityQuery &Query) {
139     const LLT QueryTy = Query.Types[TypeIdx];
140     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
141   };
142 }
143 
144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT QueryTy = Query.Types[TypeIdx];
147     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
148   };
149 }
150 
151 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
152 // v2s16.
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     if (Ty.isVector()) {
157       const int EltSize = Ty.getElementType().getSizeInBits();
158       return EltSize == 32 || EltSize == 64 ||
159             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
160              EltSize == 128 || EltSize == 256;
161     }
162 
163     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
164   };
165 }
166 
167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getElementType() == Type;
171   };
172 }
173 
174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     if (!QueryTy.isVector())
178       return false;
179     const LLT EltTy = QueryTy.getElementType();
180     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
181   };
182 }
183 
184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     const LLT Ty = Query.Types[TypeIdx];
187     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
188            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
189   };
190 }
191 
192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
193   return [=](const LegalityQuery &Query) {
194     return Query.Types[TypeIdx0].getSizeInBits() <
195            Query.Types[TypeIdx1].getSizeInBits();
196   };
197 }
198 
199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
200   return [=](const LegalityQuery &Query) {
201     return Query.Types[TypeIdx0].getSizeInBits() >
202            Query.Types[TypeIdx1].getSizeInBits();
203   };
204 }
205 
206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
207                                          const GCNTargetMachine &TM)
  : ST(ST_) {
209   using namespace TargetOpcode;
210 
211   auto GetAddrSpacePtr = [&TM](unsigned AS) {
212     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
213   };
214 
215   const LLT S1 = LLT::scalar(1);
216   const LLT S16 = LLT::scalar(16);
217   const LLT S32 = LLT::scalar(32);
218   const LLT S64 = LLT::scalar(64);
219   const LLT S128 = LLT::scalar(128);
220   const LLT S256 = LLT::scalar(256);
221   const LLT S512 = LLT::scalar(512);
222   const LLT S1024 = LLT::scalar(1024);
223 
224   const LLT V2S16 = LLT::vector(2, 16);
225   const LLT V4S16 = LLT::vector(4, 16);
226 
227   const LLT V2S32 = LLT::vector(2, 32);
228   const LLT V3S32 = LLT::vector(3, 32);
229   const LLT V4S32 = LLT::vector(4, 32);
230   const LLT V5S32 = LLT::vector(5, 32);
231   const LLT V6S32 = LLT::vector(6, 32);
232   const LLT V7S32 = LLT::vector(7, 32);
233   const LLT V8S32 = LLT::vector(8, 32);
234   const LLT V9S32 = LLT::vector(9, 32);
235   const LLT V10S32 = LLT::vector(10, 32);
236   const LLT V11S32 = LLT::vector(11, 32);
237   const LLT V12S32 = LLT::vector(12, 32);
238   const LLT V13S32 = LLT::vector(13, 32);
239   const LLT V14S32 = LLT::vector(14, 32);
240   const LLT V15S32 = LLT::vector(15, 32);
241   const LLT V16S32 = LLT::vector(16, 32);
242   const LLT V32S32 = LLT::vector(32, 32);
243 
244   const LLT V2S64 = LLT::vector(2, 64);
245   const LLT V3S64 = LLT::vector(3, 64);
246   const LLT V4S64 = LLT::vector(4, 64);
247   const LLT V5S64 = LLT::vector(5, 64);
248   const LLT V6S64 = LLT::vector(6, 64);
249   const LLT V7S64 = LLT::vector(7, 64);
250   const LLT V8S64 = LLT::vector(8, 64);
251   const LLT V16S64 = LLT::vector(16, 64);
252 
253   std::initializer_list<LLT> AllS32Vectors =
254     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
255      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
256   std::initializer_list<LLT> AllS64Vectors =
257     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
258 
259   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
260   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
261   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
262   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
263   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
264   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
265   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
266 
267   const LLT CodePtr = FlatPtr;
268 
269   const std::initializer_list<LLT> AddrSpaces64 = {
270     GlobalPtr, ConstantPtr, FlatPtr
271   };
272 
273   const std::initializer_list<LLT> AddrSpaces32 = {
274     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
275   };
276 
277   const std::initializer_list<LLT> FPTypesBase = {
278     S32, S64
279   };
280 
281   const std::initializer_list<LLT> FPTypes16 = {
282     S32, S64, S16
283   };
284 
285   const std::initializer_list<LLT> FPTypesPK16 = {
286     S32, S64, S16, V2S16
287   };
288 
289   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
290 
291   setAction({G_BRCOND, S1}, Legal); // VCC branches
292   setAction({G_BRCOND, S32}, Legal); // SCC branches
293 
294   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
295   // elements for v3s16
296   getActionDefinitionsBuilder(G_PHI)
297     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
298     .legalFor(AllS32Vectors)
299     .legalFor(AllS64Vectors)
300     .legalFor(AddrSpaces64)
301     .legalFor(AddrSpaces32)
302     .clampScalar(0, S32, S256)
303     .widenScalarToNextPow2(0, 32)
304     .clampMaxNumElements(0, S32, 16)
305     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
306     .legalIf(isPointer(0));
307 
308   if (ST.hasVOP3PInsts()) {
309     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
310       .legalFor({S32, S16, V2S16})
311       .clampScalar(0, S16, S32)
312       .clampMaxNumElements(0, S16, 2)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else if (ST.has16BitInsts()) {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32, S16})
318       .clampScalar(0, S16, S32)
319       .scalarize(0)
320       .widenScalarToNextPow2(0, 32);
321   } else {
322     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
323       .legalFor({S32})
324       .clampScalar(0, S32, S32)
325       .scalarize(0);
326   }
327 
328   // FIXME: Not really legal. Placeholder for custom lowering.
329   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
330     .customFor({S32, S64})
331     .clampScalar(0, S32, S64)
332     .widenScalarToNextPow2(0, 32)
333     .scalarize(0);
334 
335   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
336     .legalFor({S32})
337     .clampScalar(0, S32, S32)
338     .scalarize(0);
339 
340   // Report legal for any types we can handle anywhere. For the cases only legal
341   // on the SALU, RegBankSelect will be able to re-legalize.
342   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
343     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
344     .clampScalar(0, S32, S64)
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
347     .widenScalarToNextPow2(0)
348     .scalarize(0);
349 
350   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
351                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
352     .legalFor({{S32, S1}, {S32, S32}})
353     .minScalar(0, S32)
354     // TODO: .scalarize(0)
355     .lower();
356 
357   getActionDefinitionsBuilder(G_BITCAST)
358     // Don't worry about the size constraint.
359     .legalIf(all(isRegisterType(0), isRegisterType(1)))
360     .lower();
361 
  getActionDefinitionsBuilder(G_CONSTANT)
364     .legalFor({S1, S32, S64, S16, GlobalPtr,
365                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
366     .clampScalar(0, S32, S64)
367     .widenScalarToNextPow2(0)
368     .legalIf(isPointer(0));
369 
370   getActionDefinitionsBuilder(G_FCONSTANT)
371     .legalFor({S32, S64, S16})
372     .clampScalar(0, S16, S64);
373 
374   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
375     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
376                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
377     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378     .clampScalarOrElt(0, S32, S1024)
379     .legalIf(isMultiple32(0))
380     .widenScalarToNextPow2(0, 32)
381     .clampMaxNumElements(0, S32, 16);
382 
383   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
384   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
385     .unsupportedFor({PrivatePtr})
386     .custom();
387   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
388 
389   auto &FPOpActions = getActionDefinitionsBuilder(
390     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
391     .legalFor({S32, S64});
392   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
393     .customFor({S32, S64});
394   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
395     .customFor({S32, S64});
396 
397   if (ST.has16BitInsts()) {
398     if (ST.hasVOP3PInsts())
399       FPOpActions.legalFor({S16, V2S16});
400     else
401       FPOpActions.legalFor({S16});
402 
403     TrigActions.customFor({S16});
404     FDIVActions.customFor({S16});
405   }
406 
407   auto &MinNumMaxNum = getActionDefinitionsBuilder({
408       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
409 
410   if (ST.hasVOP3PInsts()) {
411     MinNumMaxNum.customFor(FPTypesPK16)
412       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
413       .clampMaxNumElements(0, S16, 2)
414       .clampScalar(0, S16, S64)
415       .scalarize(0);
416   } else if (ST.has16BitInsts()) {
417     MinNumMaxNum.customFor(FPTypes16)
418       .clampScalar(0, S16, S64)
419       .scalarize(0);
420   } else {
421     MinNumMaxNum.customFor(FPTypesBase)
422       .clampScalar(0, S32, S64)
423       .scalarize(0);
424   }
425 
426   if (ST.hasVOP3PInsts())
427     FPOpActions.clampMaxNumElements(0, S16, 2);
428 
429   FPOpActions
430     .scalarize(0)
431     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
432 
433   TrigActions
434     .scalarize(0)
435     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
436 
437   FDIVActions
438     .scalarize(0)
439     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
440 
441   getActionDefinitionsBuilder({G_FNEG, G_FABS})
442     .legalFor(FPTypesPK16)
443     .clampMaxNumElements(0, S16, 2)
444     .scalarize(0)
445     .clampScalar(0, S16, S64);
446 
447   if (ST.has16BitInsts()) {
448     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
449       .legalFor({S32, S64, S16})
450       .scalarize(0)
451       .clampScalar(0, S16, S64);
452   } else {
453     getActionDefinitionsBuilder(G_FSQRT)
454       .legalFor({S32, S64})
455       .scalarize(0)
456       .clampScalar(0, S32, S64);
457 
458     if (ST.hasFractBug()) {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .customFor({S64})
461         .legalFor({S32, S64})
462         .scalarize(0)
463         .clampScalar(0, S32, S64);
464     } else {
465       getActionDefinitionsBuilder(G_FFLOOR)
466         .legalFor({S32, S64})
467         .scalarize(0)
468         .clampScalar(0, S32, S64);
469     }
470   }
471 
472   getActionDefinitionsBuilder(G_FPTRUNC)
473     .legalFor({{S32, S64}, {S16, S32}})
474     .scalarize(0)
475     .lower();
476 
477   getActionDefinitionsBuilder(G_FPEXT)
478     .legalFor({{S64, S32}, {S32, S16}})
479     .lowerFor({{S64, S16}}) // FIXME: Implement
480     .scalarize(0);
481 
482   getActionDefinitionsBuilder(G_FSUB)
483       // Use actual fsub instruction
484       .legalFor({S32})
485       // Must use fadd + fneg
486       .lowerFor({S64, S16, V2S16})
487       .scalarize(0)
488       .clampScalar(0, S32, S64);
489 
490   // Whether this is legal depends on the floating point mode for the function.
491   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
492   if (ST.hasMadF16())
493     FMad.customFor({S32, S16});
494   else
495     FMad.customFor({S32});
496   FMad.scalarize(0)
497       .lower();
498 
499   // TODO: Do we need to clamp maximum bitwidth?
500   getActionDefinitionsBuilder(G_TRUNC)
501     .legalIf(isScalar(0))
502     .legalFor({{V2S16, V2S32}})
503     .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinitely
    // loop in the legalizer.
507     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
508     .alwaysLegal();
509 
510   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
511     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
512                {S32, S1}, {S64, S1}, {S16, S1}})
513     .scalarize(0)
514     .clampScalar(0, S32, S64)
515     .widenScalarToNextPow2(1, 32);
516 
517   // TODO: Split s1->s64 during regbankselect for VALU.
518   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
519     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
520     .lowerFor({{S32, S64}})
521     .lowerIf(typeIs(1, S1))
522     .customFor({{S64, S64}});
523   if (ST.has16BitInsts())
524     IToFP.legalFor({{S16, S16}});
525   IToFP.clampScalar(1, S32, S64)
526        .scalarize(0)
527        .widenScalarToNextPow2(1);
528 
529   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
530     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
531     .customFor({{S64, S64}});
532   if (ST.has16BitInsts())
533     FPToI.legalFor({{S16, S16}});
534   else
535     FPToI.minScalar(1, S32);
536 
537   FPToI.minScalar(0, S32)
538        .scalarize(0)
539        .lower();
540 
541   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
542     .scalarize(0)
543     .lower();
544 
545   if (ST.has16BitInsts()) {
546     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
547       .legalFor({S16, S32, S64})
548       .clampScalar(0, S16, S64)
549       .scalarize(0);
550   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
551     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
552       .legalFor({S32, S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   } else {
556     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
557       .legalFor({S32})
558       .customFor({S64})
559       .clampScalar(0, S32, S64)
560       .scalarize(0);
561   }
562 
563   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
564     .scalarize(0)
565     .alwaysLegal();
566 
567   auto &CmpBuilder =
568     getActionDefinitionsBuilder(G_ICMP)
569     // The compare output type differs based on the register bank of the output,
570     // so make both s1 and s32 legal.
571     //
572     // Scalar compares producing output in scc will be promoted to s32, as that
573     // is the allocatable register type that will be needed for the copy from
574     // scc. This will be promoted during RegBankSelect, and we assume something
575     // before that won't try to use s32 result types.
576     //
577     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
578     // bank.
579     .legalForCartesianProduct(
580       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
581     .legalForCartesianProduct(
582       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
583   if (ST.has16BitInsts()) {
584     CmpBuilder.legalFor({{S1, S16}});
585   }
586 
587   CmpBuilder
588     .widenScalarToNextPow2(1)
589     .clampScalar(1, S32, S64)
590     .scalarize(0)
591     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
592 
593   getActionDefinitionsBuilder(G_FCMP)
594     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
595     .widenScalarToNextPow2(1)
596     .clampScalar(1, S32, S64)
597     .scalarize(0);
598 
599   // FIXME: fpow has a selection pattern that should move to custom lowering.
600   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
601   if (ST.has16BitInsts())
602     Exp2Ops.legalFor({S32, S16});
603   else
604     Exp2Ops.legalFor({S32});
605   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
606   Exp2Ops.scalarize(0);
607 
608   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
609   if (ST.has16BitInsts())
610     ExpOps.customFor({{S32}, {S16}});
611   else
612     ExpOps.customFor({S32});
613   ExpOps.clampScalar(0, MinScalarFPTy, S32)
614         .scalarize(0);
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder(G_CTPOP)
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   // The hardware instructions return a different result on 0 than the generic
626   // instructions expect. The hardware produces -1, but these produce the
627   // bitwidth.
628   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
629     .scalarize(0)
630     .clampScalar(0, S32, S32)
631     .clampScalar(1, S32, S64)
632     .widenScalarToNextPow2(0, 32)
633     .widenScalarToNextPow2(1, 32)
634     .lower();
635 
636   // The 64-bit versions produce 32-bit results, but only on the SALU.
637   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
638     .legalFor({{S32, S32}, {S32, S64}})
639     .clampScalar(0, S32, S32)
640     .clampScalar(1, S32, S64)
641     .scalarize(0)
642     .widenScalarToNextPow2(0, 32)
643     .widenScalarToNextPow2(1, 32);
644 
645   getActionDefinitionsBuilder(G_BITREVERSE)
646     .legalFor({S32})
647     .clampScalar(0, S32, S32)
648     .scalarize(0);
649 
650   if (ST.has16BitInsts()) {
651     getActionDefinitionsBuilder(G_BSWAP)
652       .legalFor({S16, S32, V2S16})
653       .clampMaxNumElements(0, S16, 2)
654       // FIXME: Fixing non-power-of-2 before clamp is workaround for
655       // narrowScalar limitation.
656       .widenScalarToNextPow2(0)
657       .clampScalar(0, S16, S32)
658       .scalarize(0);
659 
660     if (ST.hasVOP3PInsts()) {
661       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
662         .legalFor({S32, S16, V2S16})
663         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
664         .clampMaxNumElements(0, S16, 2)
665         .minScalar(0, S16)
666         .widenScalarToNextPow2(0)
667         .scalarize(0)
668         .lower();
669     } else {
670       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
671         .legalFor({S32, S16})
672         .widenScalarToNextPow2(0)
673         .minScalar(0, S16)
674         .scalarize(0)
675         .lower();
676     }
677   } else {
678     // TODO: Should have same legality without v_perm_b32
679     getActionDefinitionsBuilder(G_BSWAP)
680       .legalFor({S32})
681       .lowerIf(narrowerThan(0, 32))
682       // FIXME: Fixing non-power-of-2 before clamp is workaround for
683       // narrowScalar limitation.
684       .widenScalarToNextPow2(0)
685       .maxScalar(0, S32)
686       .scalarize(0)
687       .lower();
688 
689     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
690       .legalFor({S32})
691       .minScalar(0, S32)
692       .widenScalarToNextPow2(0)
693       .scalarize(0)
694       .lower();
695   }
696 
697   getActionDefinitionsBuilder(G_INTTOPTR)
698     // List the common cases
699     .legalForCartesianProduct(AddrSpaces64, {S64})
700     .legalForCartesianProduct(AddrSpaces32, {S32})
701     .scalarize(0)
702     // Accept any address space as long as the size matches
703     .legalIf(sameSize(0, 1))
704     .widenScalarIf(smallerThan(1, 0),
705       [](const LegalityQuery &Query) {
706         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
707       })
708     .narrowScalarIf(greaterThan(1, 0),
709       [](const LegalityQuery &Query) {
710         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
711       });
712 
713   getActionDefinitionsBuilder(G_PTRTOINT)
714     // List the common cases
715     .legalForCartesianProduct(AddrSpaces64, {S64})
716     .legalForCartesianProduct(AddrSpaces32, {S32})
717     .scalarize(0)
718     // Accept any address space as long as the size matches
719     .legalIf(sameSize(0, 1))
720     .widenScalarIf(smallerThan(0, 1),
721       [](const LegalityQuery &Query) {
722         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
723       })
724     .narrowScalarIf(
725       greaterThan(0, 1),
726       [](const LegalityQuery &Query) {
727         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
728       });
729 
730   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
731     .scalarize(0)
732     .custom();
733 
734   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
735   // handle some operations by just promoting the register during
736   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
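  // Maximum memory access size in bits for a single load/store in the given
  // address space.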
737   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
738     switch (AS) {
739     // FIXME: Private element size.
740     case AMDGPUAS::PRIVATE_ADDRESS:
741       return 32;
742     // FIXME: Check subtarget
743     case AMDGPUAS::LOCAL_ADDRESS:
744       return ST.useDS128() ? 128 : 64;
745 
746     // Treat constant and global as identical. SMRD loads are sometimes usable
747     // for global loads (ideally constant address space should be eliminated)
748     // depending on the context. Legality cannot be context dependent, but
749     // RegBankSelect can split the load as necessary depending on the pointer
750     // register bank/uniformity and if the memory is invariant or not written in
751     // a kernel.
752     case AMDGPUAS::CONSTANT_ADDRESS:
753     case AMDGPUAS::GLOBAL_ADDRESS:
754       return IsLoad ? 512 : 128;
755     default:
756       return 128;
757     }
758   };
759 
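  // Return true if a load/store of this type must be split into multiple
  // memory operations: the access is too wide for the address space, needs an
  // unsupported register count, or is insufficiently aligned.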
760   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
761                                     bool IsLoad) -> bool {
762     const LLT DstTy = Query.Types[0];
763 
764     // Split vector extloads.
765     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
766     unsigned Align = Query.MMODescrs[0].AlignInBits;
767 
768     if (MemSize < DstTy.getSizeInBits())
769       MemSize = std::max(MemSize, Align);
770 
771     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
772       return true;
773 
774     const LLT PtrTy = Query.Types[1];
775     unsigned AS = PtrTy.getAddressSpace();
776     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
777       return true;
778 
    // Catch weird-sized loads that don't evenly divide into the access sizes.
780     // TODO: May be able to widen depending on alignment etc.
781     unsigned NumRegs = (MemSize + 31) / 32;
782     if (NumRegs == 3) {
783       if (!ST.hasDwordx3LoadStores())
784         return true;
785     } else {
786       // If the alignment allows, these should have been widened.
787       if (!isPowerOf2_32(NumRegs))
788         return true;
789     }
790 
791     if (Align < MemSize) {
792       const SITargetLowering *TLI = ST.getTargetLowering();
793       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
794     }
795 
796     return false;
797   };
798 
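  // Return true if an odd-sized load result should be widened to the next
  // power of 2, which is only done when the access is aligned to at least the
  // rounded-up size.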
799   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
800     unsigned Size = Query.Types[0].getSizeInBits();
801     if (isPowerOf2_32(Size))
802       return false;
803 
804     if (Size == 96 && ST.hasDwordx3LoadStores())
805       return false;
806 
807     unsigned AddrSpace = Query.Types[1].getAddressSpace();
808     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
809       return false;
810 
811     unsigned Align = Query.MMODescrs[0].AlignInBits;
812     unsigned RoundedSize = NextPowerOf2(Size);
813     return (Align >= RoundedSize);
814   };
815 
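  // If the subtarget supports unaligned buffer access, don't require a minimum
  // alignment (0) for the global/constant/flat cases below.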
816   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
817   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
818   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
819 
820   // TODO: Refine based on subtargets which support unaligned access or 128-bit
821   // LDS
822   // TODO: Unsupported flat for SI.
823 
824   for (unsigned Op : {G_LOAD, G_STORE}) {
825     const bool IsStore = Op == G_STORE;
826 
827     auto &Actions = getActionDefinitionsBuilder(Op);
828     // Whitelist the common cases.
829     // TODO: Loads to s16 on gfx9
830     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
831                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
832                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
833                                       {S128, GlobalPtr, 128, GlobalAlign32},
834                                       {S64, GlobalPtr, 64, GlobalAlign32},
835                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
836                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
837                                       {S32, GlobalPtr, 8, GlobalAlign8},
838                                       {S32, GlobalPtr, 16, GlobalAlign16},
839 
840                                       {S32, LocalPtr, 32, 32},
841                                       {S64, LocalPtr, 64, 32},
842                                       {V2S32, LocalPtr, 64, 32},
843                                       {S32, LocalPtr, 8, 8},
844                                       {S32, LocalPtr, 16, 16},
845                                       {V2S16, LocalPtr, 32, 32},
846 
847                                       {S32, PrivatePtr, 32, 32},
848                                       {S32, PrivatePtr, 8, 8},
849                                       {S32, PrivatePtr, 16, 16},
850                                       {V2S16, PrivatePtr, 32, 32},
851 
852                                       {S32, FlatPtr, 32, GlobalAlign32},
853                                       {S32, FlatPtr, 16, GlobalAlign16},
854                                       {S32, FlatPtr, 8, GlobalAlign8},
855                                       {V2S16, FlatPtr, 32, GlobalAlign32},
856 
857                                       {S32, ConstantPtr, 32, GlobalAlign32},
858                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
859                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
860                                       {S64, ConstantPtr, 64, GlobalAlign32},
861                                       {S128, ConstantPtr, 128, GlobalAlign32},
862                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
863     Actions
864         .customIf(typeIs(1, Constant32Ptr))
865         // Widen suitably aligned loads by loading extra elements.
866         .moreElementsIf([=](const LegalityQuery &Query) {
867             const LLT Ty = Query.Types[0];
868             return Op == G_LOAD && Ty.isVector() &&
869                    shouldWidenLoadResult(Query);
870           }, moreElementsToNextPow2(0))
871         .widenScalarIf([=](const LegalityQuery &Query) {
872             const LLT Ty = Query.Types[0];
873             return Op == G_LOAD && !Ty.isVector() &&
874                    shouldWidenLoadResult(Query);
875           }, widenScalarOrEltToNextPow2(0))
876         .narrowScalarIf(
877             [=](const LegalityQuery &Query) -> bool {
878               return !Query.Types[0].isVector() &&
879                      needToSplitMemOp(Query, Op == G_LOAD);
880             },
881             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
882               const LLT DstTy = Query.Types[0];
883               const LLT PtrTy = Query.Types[1];
884 
885               const unsigned DstSize = DstTy.getSizeInBits();
886               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
887 
888               // Split extloads.
889               if (DstSize > MemSize)
890                 return std::make_pair(0, LLT::scalar(MemSize));
891 
892               if (!isPowerOf2_32(DstSize)) {
893                 // We're probably decomposing an odd sized store. Try to split
894                 // to the widest type. TODO: Account for alignment. As-is it
895                 // should be OK, since the new parts will be further legalized.
896                 unsigned FloorSize = PowerOf2Floor(DstSize);
897                 return std::make_pair(0, LLT::scalar(FloorSize));
898               }
899 
900               if (DstSize > 32 && (DstSize % 32 != 0)) {
901                 // FIXME: Need a way to specify non-extload of larger size if
902                 // suitably aligned.
903                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
904               }
905 
906               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
907                                                      Op == G_LOAD);
908               if (MemSize > MaxSize)
909                 return std::make_pair(0, LLT::scalar(MaxSize));
910 
911               unsigned Align = Query.MMODescrs[0].AlignInBits;
912               return std::make_pair(0, LLT::scalar(Align));
913             })
914         .fewerElementsIf(
915             [=](const LegalityQuery &Query) -> bool {
916               return Query.Types[0].isVector() &&
917                      needToSplitMemOp(Query, Op == G_LOAD);
918             },
919             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
920               const LLT DstTy = Query.Types[0];
921               const LLT PtrTy = Query.Types[1];
922 
923               LLT EltTy = DstTy.getElementType();
924               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
925                                                      Op == G_LOAD);
926 
927               // FIXME: Handle widened to power of 2 results better. This ends
928               // up scalarizing.
929               // FIXME: 3 element stores scalarized on SI
930 
931               // Split if it's too large for the address space.
932               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
933                 unsigned NumElts = DstTy.getNumElements();
934                 unsigned EltSize = EltTy.getSizeInBits();
935 
936                 if (MaxSize % EltSize == 0) {
937                   return std::make_pair(
938                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
939                 }
940 
941                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
942 
943                 // FIXME: Refine when odd breakdowns handled
944                 // The scalars will need to be re-legalized.
945                 if (NumPieces == 1 || NumPieces >= NumElts ||
946                     NumElts % NumPieces != 0)
947                   return std::make_pair(0, EltTy);
948 
949                 return std::make_pair(0,
950                                       LLT::vector(NumElts / NumPieces, EltTy));
951               }
952 
953               // FIXME: We could probably handle weird extending loads better.
954               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
955               if (DstTy.getSizeInBits() > MemSize)
956                 return std::make_pair(0, EltTy);
957 
958               unsigned EltSize = EltTy.getSizeInBits();
959               unsigned DstSize = DstTy.getSizeInBits();
960               if (!isPowerOf2_32(DstSize)) {
961                 // We're probably decomposing an odd sized store. Try to split
962                 // to the widest type. TODO: Account for alignment. As-is it
963                 // should be OK, since the new parts will be further legalized.
964                 unsigned FloorSize = PowerOf2Floor(DstSize);
965                 return std::make_pair(
966                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
967               }
968 
969               // Need to split because of alignment.
970               unsigned Align = Query.MMODescrs[0].AlignInBits;
971               if (EltSize > Align &&
972                   (EltSize / Align < DstTy.getNumElements())) {
973                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
974               }
975 
976               // May need relegalization for the scalars.
977               return std::make_pair(0, EltTy);
978             })
979         .minScalar(0, S32);
980 
981     if (IsStore)
982       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
983 
984     // TODO: Need a bitcast lower option?
985     Actions
986         .legalIf([=](const LegalityQuery &Query) {
987           const LLT Ty0 = Query.Types[0];
988           unsigned Size = Ty0.getSizeInBits();
989           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
990           unsigned Align = Query.MMODescrs[0].AlignInBits;
991 
992           // FIXME: Widening store from alignment not valid.
993           if (MemSize < Size)
994             MemSize = std::max(MemSize, Align);
995 
996           // No extending vector loads.
997           if (Size > MemSize && Ty0.isVector())
998             return false;
999 
1000           switch (MemSize) {
1001           case 8:
1002           case 16:
1003             return Size == 32;
1004           case 32:
1005           case 64:
1006           case 128:
1007             return true;
1008           case 96:
1009             return ST.hasDwordx3LoadStores();
1010           case 256:
1011           case 512:
1012             return true;
1013           default:
1014             return false;
1015           }
1016         })
1017         .widenScalarToNextPow2(0)
1018         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1019   }
1020 
1021   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1022                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1023                                                   {S32, GlobalPtr, 16, 2 * 8},
1024                                                   {S32, LocalPtr, 8, 8},
1025                                                   {S32, LocalPtr, 16, 16},
1026                                                   {S32, PrivatePtr, 8, 8},
1027                                                   {S32, PrivatePtr, 16, 16},
1028                                                   {S32, ConstantPtr, 8, 8},
1029                                                   {S32, ConstantPtr, 16, 2 * 8}});
1030   if (ST.hasFlatAddressSpace()) {
1031     ExtLoads.legalForTypesWithMemDesc(
1032         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1033   }
1034 
1035   ExtLoads.clampScalar(0, S32, S32)
1036           .widenScalarToNextPow2(0)
1037           .unsupportedIfMemSizeNotPow2()
1038           .lower();
1039 
1040   auto &Atomics = getActionDefinitionsBuilder(
1041     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1042      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1043      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1044      G_ATOMICRMW_UMIN})
1045     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1046                {S64, GlobalPtr}, {S64, LocalPtr}});
1047   if (ST.hasFlatAddressSpace()) {
1048     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1049   }
1050 
1051   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1052     .legalFor({{S32, LocalPtr}});
1053 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
1056   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1057     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1058                 {S32, FlatPtr}, {S64, FlatPtr}})
1059     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1060                {S32, RegionPtr}, {S64, RegionPtr}});
1061   // TODO: Pointer types, any 32-bit or 64-bit vector
1062 
1063   // Condition should be s32 for scalar, s1 for vector.
1064   getActionDefinitionsBuilder(G_SELECT)
1065     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1066           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1067           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1068     .clampScalar(0, S16, S64)
1069     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1070     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1071     .scalarize(1)
1072     .clampMaxNumElements(0, S32, 2)
1073     .clampMaxNumElements(0, LocalPtr, 2)
1074     .clampMaxNumElements(0, PrivatePtr, 2)
1075     .scalarize(0)
1076     .widenScalarToNextPow2(0)
1077     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1078 
1079   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1080   // be more flexible with the shift amount type.
1081   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1082     .legalFor({{S32, S32}, {S64, S32}});
1083   if (ST.has16BitInsts()) {
1084     if (ST.hasVOP3PInsts()) {
1085       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1086             .clampMaxNumElements(0, S16, 2);
1087     } else
1088       Shifts.legalFor({{S16, S32}, {S16, S16}});
1089 
1090     // TODO: Support 16-bit shift amounts
1091     Shifts.clampScalar(1, S32, S32);
1092     Shifts.clampScalar(0, S16, S64);
1093     Shifts.widenScalarToNextPow2(0, 16);
1094   } else {
1095     // Make sure we legalize the shift amount type first, as the general
1096     // expansion for the shifted type will produce much worse code if it hasn't
1097     // been truncated already.
1098     Shifts.clampScalar(1, S32, S32);
1099     Shifts.clampScalar(0, S32, S64);
1100     Shifts.widenScalarToNextPow2(0, 32);
1101   }
1102   Shifts.scalarize(0);
1103 
1104   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1105     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1106     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1107     unsigned IdxTypeIdx = 2;
1108 
1109     getActionDefinitionsBuilder(Op)
1110       .customIf([=](const LegalityQuery &Query) {
1111           const LLT EltTy = Query.Types[EltTypeIdx];
1112           const LLT VecTy = Query.Types[VecTypeIdx];
1113           const LLT IdxTy = Query.Types[IdxTypeIdx];
1114           return (EltTy.getSizeInBits() == 16 ||
1115                   EltTy.getSizeInBits() % 32 == 0) &&
1116                  VecTy.getSizeInBits() % 32 == 0 &&
1117                  VecTy.getSizeInBits() <= 1024 &&
1118                  IdxTy.getSizeInBits() == 32;
1119         })
1120       .clampScalar(EltTypeIdx, S32, S64)
1121       .clampScalar(VecTypeIdx, S32, S64)
1122       .clampScalar(IdxTypeIdx, S32, S32);
1123   }
1124 
1125   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1126     .unsupportedIf([=](const LegalityQuery &Query) {
1127         const LLT &EltTy = Query.Types[1].getElementType();
1128         return Query.Types[0] != EltTy;
1129       });
1130 
1131   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1132     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1133     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1134 
1135     // FIXME: Doesn't handle extract of illegal sizes.
1136     getActionDefinitionsBuilder(Op)
1137       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1138       // FIXME: Multiples of 16 should not be legal.
1139       .legalIf([=](const LegalityQuery &Query) {
1140           const LLT BigTy = Query.Types[BigTyIdx];
1141           const LLT LitTy = Query.Types[LitTyIdx];
1142           return (BigTy.getSizeInBits() % 32 == 0) &&
1143                  (LitTy.getSizeInBits() % 16 == 0);
1144         })
1145       .widenScalarIf(
1146         [=](const LegalityQuery &Query) {
1147           const LLT BigTy = Query.Types[BigTyIdx];
1148           return (BigTy.getScalarSizeInBits() < 16);
1149         },
1150         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1151       .widenScalarIf(
1152         [=](const LegalityQuery &Query) {
1153           const LLT LitTy = Query.Types[LitTyIdx];
1154           return (LitTy.getScalarSizeInBits() < 16);
1155         },
1156         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1157       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1158       .widenScalarToNextPow2(BigTyIdx, 32);
1159 
1160   }
1161 
1162   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1163     .legalForCartesianProduct(AllS32Vectors, {S32})
1164     .legalForCartesianProduct(AllS64Vectors, {S64})
1165     .clampNumElements(0, V16S32, V32S32)
1166     .clampNumElements(0, V2S64, V16S64)
1167     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1168 
1169   if (ST.hasScalarPackInsts()) {
1170     BuildVector
1171       // FIXME: Should probably widen s1 vectors straight to s32
1172       .minScalarOrElt(0, S16)
1173       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1174       .minScalar(1, S32);
1175 
1176     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1177       .legalFor({V2S16, S32})
1178       .lower();
1179     BuildVector.minScalarOrElt(0, S32);
1180   } else {
1181     BuildVector.customFor({V2S16, S16});
1182     BuildVector.minScalarOrElt(0, S32);
1183 
1184     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1185       .customFor({V2S16, S32})
1186       .lower();
1187   }
1188 
1189   BuildVector.legalIf(isRegisterType(0));
1190 
1191   // FIXME: Clamp maximum size
1192   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1193     .legalIf(isRegisterType(0));
1194 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1196   // pre-legalize.
1197   if (ST.hasVOP3PInsts()) {
1198     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1199       .customFor({V2S16, V2S16})
1200       .lower();
1201   } else
1202     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1203 
1204   // Merge/Unmerge
1205   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1206     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1207     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1208 
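    // Element types narrower than 8 bits, wider than 512 bits, or with a
    // non-power-of-2 size can't be merged/unmerged directly; such vectors are
    // scalarized below.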
1209     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1210       const LLT Ty = Query.Types[TypeIdx];
1211       if (Ty.isVector()) {
1212         const LLT &EltTy = Ty.getElementType();
1213         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1214           return true;
1215         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1216           return true;
1217       }
1218       return false;
1219     };
1220 
1221     auto &Builder = getActionDefinitionsBuilder(Op)
1222       // Try to widen to s16 first for small types.
1223       // TODO: Only do this on targets with legal s16 shifts
1224       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1225 
1226       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1227       .lowerFor({{S16, V2S16}})
1228       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1229       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1230                            elementTypeIs(1, S16)),
1231                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1235       .clampScalar(LitTyIdx, S32, S512)
1236       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1237       // Break up vectors with weird elements into scalars
1238       .fewerElementsIf(
1239         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1240         scalarize(0))
1241       .fewerElementsIf(
1242         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1243         scalarize(1))
1244       .clampScalar(BigTyIdx, S32, S1024);
1245 
1246     if (Op == G_MERGE_VALUES) {
1247       Builder.widenScalarIf(
1248         // TODO: Use 16-bit shifts if legal for 8-bit values?
1249         [=](const LegalityQuery &Query) {
1250           const LLT Ty = Query.Types[LitTyIdx];
1251           return Ty.getSizeInBits() < 32;
1252         },
1253         changeTo(LitTyIdx, S32));
1254     }
1255 
1256     Builder.widenScalarIf(
1257       [=](const LegalityQuery &Query) {
1258         const LLT Ty = Query.Types[BigTyIdx];
1259         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1260           Ty.getSizeInBits() % 16 != 0;
1261       },
1262       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 above 128,
        // whichever is smaller.
1265         const LLT &Ty = Query.Types[BigTyIdx];
1266         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1267         if (NewSizeInBits >= 256) {
1268           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1269           if (RoundedTo < NewSizeInBits)
1270             NewSizeInBits = RoundedTo;
1271         }
1272         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1273       })
1274       .legalIf([=](const LegalityQuery &Query) {
1275           const LLT &BigTy = Query.Types[BigTyIdx];
1276           const LLT &LitTy = Query.Types[LitTyIdx];
1277 
1278           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1279             return false;
1280           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1281             return false;
1282 
1283           return BigTy.getSizeInBits() % 16 == 0 &&
1284                  LitTy.getSizeInBits() % 16 == 0 &&
1285                  BigTy.getSizeInBits() <= 1024;
1286         })
1287       // Any vectors left are the wrong size. Scalarize them.
1288       .scalarize(0)
1289       .scalarize(1);
1290   }
1291 
1292   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1293   // RegBankSelect.
1294   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1295     .legalFor({{S32}, {S64}});
1296 
1297   if (ST.hasVOP3PInsts()) {
1298     SextInReg.lowerFor({{V2S16}})
1299       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1300       // get more vector shift opportunities, since we'll get those when
1301       // expanded.
1302       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1303   } else if (ST.has16BitInsts()) {
1304     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1305   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit shifts.
    // This avoids a lot of intermediate truncate and extend operations.
1308     SextInReg.lowerFor({{S32}, {S64}});
1309   }
1310 
1311   SextInReg
1312     .scalarize(0)
1313     .clampScalar(0, S32, S64)
1314     .lower();
1315 
1316   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1317     .legalFor({S64});
1318 
1319   getActionDefinitionsBuilder({
1320       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1321       G_FCOPYSIGN,
1322 
1323       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1324       G_READ_REGISTER,
1325       G_WRITE_REGISTER,
1326 
1327       G_SADDO, G_SSUBO,
1328 
      // TODO: Implement
1330       G_FMINIMUM, G_FMAXIMUM
1331     }).lower();
1332 
1333   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1334         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1335         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1336     .unsupported();
1337 
1338   computeTables();
1339   verify(*ST.getInstrInfo());
1340 }
1341 
1342 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1343                                          MachineRegisterInfo &MRI,
1344                                          MachineIRBuilder &B,
1345                                          GISelChangeObserver &Observer) const {
1346   switch (MI.getOpcode()) {
1347   case TargetOpcode::G_ADDRSPACE_CAST:
1348     return legalizeAddrSpaceCast(MI, MRI, B);
1349   case TargetOpcode::G_FRINT:
1350     return legalizeFrint(MI, MRI, B);
1351   case TargetOpcode::G_FCEIL:
1352     return legalizeFceil(MI, MRI, B);
1353   case TargetOpcode::G_INTRINSIC_TRUNC:
1354     return legalizeIntrinsicTrunc(MI, MRI, B);
1355   case TargetOpcode::G_SITOFP:
1356     return legalizeITOFP(MI, MRI, B, true);
1357   case TargetOpcode::G_UITOFP:
1358     return legalizeITOFP(MI, MRI, B, false);
1359   case TargetOpcode::G_FPTOSI:
1360     return legalizeFPTOI(MI, MRI, B, true);
1361   case TargetOpcode::G_FPTOUI:
1362     return legalizeFPTOI(MI, MRI, B, false);
1363   case TargetOpcode::G_FMINNUM:
1364   case TargetOpcode::G_FMAXNUM:
1365   case TargetOpcode::G_FMINNUM_IEEE:
1366   case TargetOpcode::G_FMAXNUM_IEEE:
1367     return legalizeMinNumMaxNum(MI, MRI, B);
1368   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1369     return legalizeExtractVectorElt(MI, MRI, B);
1370   case TargetOpcode::G_INSERT_VECTOR_ELT:
1371     return legalizeInsertVectorElt(MI, MRI, B);
1372   case TargetOpcode::G_SHUFFLE_VECTOR:
1373     return legalizeShuffleVector(MI, MRI, B);
1374   case TargetOpcode::G_FSIN:
1375   case TargetOpcode::G_FCOS:
1376     return legalizeSinCos(MI, MRI, B);
1377   case TargetOpcode::G_GLOBAL_VALUE:
1378     return legalizeGlobalValue(MI, MRI, B);
1379   case TargetOpcode::G_LOAD:
1380     return legalizeLoad(MI, MRI, B, Observer);
1381   case TargetOpcode::G_FMAD:
1382     return legalizeFMad(MI, MRI, B);
1383   case TargetOpcode::G_FDIV:
1384     return legalizeFDIV(MI, MRI, B);
1385   case TargetOpcode::G_UDIV:
1386   case TargetOpcode::G_UREM:
1387     return legalizeUDIV_UREM(MI, MRI, B);
1388   case TargetOpcode::G_SDIV:
1389   case TargetOpcode::G_SREM:
1390     return legalizeSDIV_SREM(MI, MRI, B);
1391   case TargetOpcode::G_ATOMIC_CMPXCHG:
1392     return legalizeAtomicCmpXChg(MI, MRI, B);
1393   case TargetOpcode::G_FLOG:
1394     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1395   case TargetOpcode::G_FLOG10:
1396     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1397   case TargetOpcode::G_FEXP:
1398     return legalizeFExp(MI, B);
1399   case TargetOpcode::G_FPOW:
1400     return legalizeFPow(MI, B);
1401   case TargetOpcode::G_FFLOOR:
1402     return legalizeFFloor(MI, MRI, B);
1403   case TargetOpcode::G_BUILD_VECTOR:
1404     return legalizeBuildVector(MI, MRI, B);
1405   default:
1406     return false;
1407   }
1408 
1409   llvm_unreachable("expected switch to return");
1410 }
1411 
1412 Register AMDGPULegalizerInfo::getSegmentAperture(
1413   unsigned AS,
1414   MachineRegisterInfo &MRI,
1415   MachineIRBuilder &B) const {
1416   MachineFunction &MF = B.getMF();
1417   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1418   const LLT S32 = LLT::scalar(32);
1419 
1420   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1421 
1422   if (ST.hasApertureRegs()) {
1423     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1424     // getreg.
1425     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1426         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1427         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1428     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1429         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1430         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1431     unsigned Encoding =
1432         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1433         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1434         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1435 
1436     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1437 
1438     B.buildInstr(AMDGPU::S_GETREG_B32)
1439       .addDef(GetReg)
1440       .addImm(Encoding);
1441     MRI.setType(GetReg, S32);
1442 
1443     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1444     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1445   }
1446 
1447   Register QueuePtr = MRI.createGenericVirtualRegister(
1448     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1449 
1450   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1451   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1452     return Register();
1453 
1454   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1455   // private_segment_aperture_base_hi.
1456   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1457 
1458   // TODO: can we be smarter about machine pointer info?
1459   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1460   MachineMemOperand *MMO = MF.getMachineMemOperand(
1461     PtrInfo,
1462     MachineMemOperand::MOLoad |
1463     MachineMemOperand::MODereferenceable |
1464     MachineMemOperand::MOInvariant,
1465     4,
1466     MinAlign(64, StructOffset));
1467 
1468   Register LoadAddr;
1469 
1470   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1471   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1472 }
1473 
1474 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1475   MachineInstr &MI, MachineRegisterInfo &MRI,
1476   MachineIRBuilder &B) const {
1477   MachineFunction &MF = B.getMF();
1478 
1479   B.setInstr(MI);
1480 
1481   const LLT S32 = LLT::scalar(32);
1482   Register Dst = MI.getOperand(0).getReg();
1483   Register Src = MI.getOperand(1).getReg();
1484 
1485   LLT DstTy = MRI.getType(Dst);
1486   LLT SrcTy = MRI.getType(Src);
1487   unsigned DestAS = DstTy.getAddressSpace();
1488   unsigned SrcAS = SrcTy.getAddressSpace();
1489 
1490   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1491   // vector element.
1492   assert(!DstTy.isVector());
1493 
1494   const AMDGPUTargetMachine &TM
1495     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1496 
1497   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1498   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1499     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1500     return true;
1501   }
1502 
1503   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1504     // Truncate.
1505     B.buildExtract(Dst, Src, 0);
1506     MI.eraseFromParent();
1507     return true;
1508   }
1509 
1510   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1511     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1512     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1513 
1514     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1515     // another pointer. Merge operands are required to be the same type, but
1516     // creating an extra ptrtoint would be kind of pointless.
1517     auto HighAddr = B.buildConstant(
1518       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1519     B.buildMerge(Dst, {Src, HighAddr});
1520     MI.eraseFromParent();
1521     return true;
1522   }
1523 
1524   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1525     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1526            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1527     unsigned NullVal = TM.getNullPointerValue(DestAS);
1528 
1529     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1530     auto FlatNull = B.buildConstant(SrcTy, 0);
1531 
1532     // Extract low 32-bits of the pointer.
1533     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1534 
1535     auto CmpRes =
1536         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1537     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1538 
1539     MI.eraseFromParent();
1540     return true;
1541   }
1542 
1543   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1544     return false;
1545 
1546   if (!ST.hasFlatAddressSpace())
1547     return false;
1548 
1549   auto SegmentNull =
1550       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1551   auto FlatNull =
1552       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1553 
1554   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1555   if (!ApertureReg.isValid())
1556     return false;
1557 
1558   auto CmpRes =
1559       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1560 
1561   // Coerce the type of the low half of the result so we can use merge_values.
1562   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1563 
1564   // TODO: Should we allow mismatched types but matching sizes in merges to
1565   // avoid the ptrtoint?
1566   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1567   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1568 
1569   MI.eraseFromParent();
1570   return true;
1571 }
1572 
1573 bool AMDGPULegalizerInfo::legalizeFrint(
1574   MachineInstr &MI, MachineRegisterInfo &MRI,
1575   MachineIRBuilder &B) const {
1576   B.setInstr(MI);
1577 
1578   Register Src = MI.getOperand(1).getReg();
1579   LLT Ty = MRI.getType(Src);
1580   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1581 
1582   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1583   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1584 
1585   auto C1 = B.buildFConstant(Ty, C1Val);
1586   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1587 
1588   // TODO: Should this propagate fast-math-flags?
1589   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1590   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1591 
1592   auto C2 = B.buildFConstant(Ty, C2Val);
1593   auto Fabs = B.buildFAbs(Ty, Src);
1594 
1595   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1596   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1597   return true;
1598 }
1599 
1600 bool AMDGPULegalizerInfo::legalizeFceil(
1601   MachineInstr &MI, MachineRegisterInfo &MRI,
1602   MachineIRBuilder &B) const {
1603   B.setInstr(MI);
1604 
1605   const LLT S1 = LLT::scalar(1);
1606   const LLT S64 = LLT::scalar(64);
1607 
1608   Register Src = MI.getOperand(1).getReg();
1609   assert(MRI.getType(Src) == S64);
1610 
1611   // result = trunc(src)
1612   // if (src > 0.0 && src != result)
1613   //   result += 1.0
1614 
1615   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1616 
1617   const auto Zero = B.buildFConstant(S64, 0.0);
1618   const auto One = B.buildFConstant(S64, 1.0);
1619   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1620   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1621   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1622   auto Add = B.buildSelect(S64, And, One, Zero);
1623 
1624   // TODO: Should this propagate fast-math-flags?
1625   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1626   return true;
1627 }
1628 
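// Extract the unbiased exponent from the high 32 bits of an f64 value. A
// rough sketch of the sequence built below:
//   exp = ubfe(Hi, FractBits - 32, ExpBits) - 1023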
1629 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1630                                               MachineIRBuilder &B) {
1631   const unsigned FractBits = 52;
1632   const unsigned ExpBits = 11;
1633   LLT S32 = LLT::scalar(32);
1634 
1635   auto Const0 = B.buildConstant(S32, FractBits - 32);
1636   auto Const1 = B.buildConstant(S32, ExpBits);
1637 
1638   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1639     .addUse(Hi)
1640     .addUse(Const0.getReg(0)).addUse(Const1.getReg(0));
1641 
1642   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1643 }
1644 
1645 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1646   MachineInstr &MI, MachineRegisterInfo &MRI,
1647   MachineIRBuilder &B) const {
1648   B.setInstr(MI);
1649 
1650   const LLT S1 = LLT::scalar(1);
1651   const LLT S32 = LLT::scalar(32);
1652   const LLT S64 = LLT::scalar(64);
1653 
1654   Register Src = MI.getOperand(1).getReg();
1655   assert(MRI.getType(Src) == S64);
1656 
1657   // TODO: Should this use extract since the low half is unused?
1658   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1659   Register Hi = Unmerge.getReg(1);
1660 
1661   // Extract the upper half, since this is where we will find the sign and
1662   // exponent.
1663   auto Exp = extractF64Exponent(Hi, B);
1664 
1665   const unsigned FractBits = 52;
1666 
1667   // Extract the sign bit.
1668   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1669   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1670 
1671   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1672 
1673   const auto Zero32 = B.buildConstant(S32, 0);
1674 
1675   // Extend back to 64-bits.
1676   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1677 
1678   auto Shr = B.buildAShr(S64, FractMask, Exp);
1679   auto Not = B.buildNot(S64, Shr);
1680   auto Tmp0 = B.buildAnd(S64, Src, Not);
1681   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1682 
1683   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1684   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1685 
1686   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1687   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1688   return true;
1689 }
1690 
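// Lower a 64-bit integer to f64 conversion. A rough sketch of the expansion:
// convert the two 32-bit halves separately, scale the converted high half by
// 2^32 with ldexp, and add the (always unsigned) low half:
//   result = ldexp([su]itofp(hi32(x)), 32) + uitofp(lo32(x))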
1691 bool AMDGPULegalizerInfo::legalizeITOFP(
1692   MachineInstr &MI, MachineRegisterInfo &MRI,
1693   MachineIRBuilder &B, bool Signed) const {
1694   B.setInstr(MI);
1695 
1696   Register Dst = MI.getOperand(0).getReg();
1697   Register Src = MI.getOperand(1).getReg();
1698 
1699   const LLT S64 = LLT::scalar(64);
1700   const LLT S32 = LLT::scalar(32);
1701 
1702   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1703 
1704   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1705 
1706   auto CvtHi = Signed ?
1707     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1708     B.buildUITOFP(S64, Unmerge.getReg(1));
1709 
1710   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1711 
1712   auto ThirtyTwo = B.buildConstant(S32, 32);
1713   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1714     .addUse(CvtHi.getReg(0))
1715     .addUse(ThirtyTwo.getReg(0));
1716 
1717   // TODO: Should this propagate fast-math-flags?
1718   B.buildFAdd(Dst, LdExp, CvtLo);
1719   MI.eraseFromParent();
1720   return true;
1721 }
1722 
1723 // TODO: Copied from DAG implementation. Verify logic and document how this
1724 // actually works.
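//
// A rough reading of the sequence built below, with K0 = 2^-32 and
// K1 = -(2^32) as f64 constants:
//   T  = trunc(x)
//   Hi = fptoint(floor(T * K0))              ; upper 32 bits
//   Lo = fptouint(fma(floor(T * K0), K1, T)) ; remaining lower 32 bits
//   result = merge(Lo, Hi)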
1725 bool AMDGPULegalizerInfo::legalizeFPTOI(
1726   MachineInstr &MI, MachineRegisterInfo &MRI,
1727   MachineIRBuilder &B, bool Signed) const {
1728   B.setInstr(MI);
1729 
1730   Register Dst = MI.getOperand(0).getReg();
1731   Register Src = MI.getOperand(1).getReg();
1732 
1733   const LLT S64 = LLT::scalar(64);
1734   const LLT S32 = LLT::scalar(32);
1735 
1736   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1737 
1738   unsigned Flags = MI.getFlags();
1739 
1740   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1741   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1742   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1743 
1744   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1745   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1746   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1747 
1748   auto Hi = Signed ?
1749     B.buildFPTOSI(S32, FloorMul) :
1750     B.buildFPTOUI(S32, FloorMul);
1751   auto Lo = B.buildFPTOUI(S32, Fma);
1752 
1753   B.buildMerge(Dst, { Lo, Hi });
1754   MI.eraseFromParent();
1755 
1756   return true;
1757 }
1758 
1759 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1760   MachineInstr &MI, MachineRegisterInfo &MRI,
1761   MachineIRBuilder &B) const {
1762   MachineFunction &MF = B.getMF();
1763   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1764 
1765   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1766                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1767 
1768   // With ieee_mode disabled, the instructions have the correct behavior
1769   // already for G_FMINNUM/G_FMAXNUM
1770   if (!MFI->getMode().IEEE)
1771     return !IsIEEEOp;
1772 
1773   if (IsIEEEOp)
1774     return true;
1775 
1776   MachineIRBuilder HelperBuilder(MI);
1777   GISelObserverWrapper DummyObserver;
1778   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1779   HelperBuilder.setInstr(MI);
1780   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1781 }
1782 
1783 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1784   MachineInstr &MI, MachineRegisterInfo &MRI,
1785   MachineIRBuilder &B) const {
1786   // TODO: Should move some of this into LegalizerHelper.
1787 
1788   // TODO: Promote dynamic indexing of s16 to s32
1789 
1790   // FIXME: Artifact combiner probably should have replaced the truncated
1791   // constant before this, so we shouldn't need
1792   // getConstantVRegValWithLookThrough.
1793   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1794     MI.getOperand(2).getReg(), MRI);
1795   if (!IdxVal) // Dynamic case will be selected to register indexing.
1796     return true;
1797 
1798   Register Dst = MI.getOperand(0).getReg();
1799   Register Vec = MI.getOperand(1).getReg();
1800 
1801   LLT VecTy = MRI.getType(Vec);
1802   LLT EltTy = VecTy.getElementType();
1803   assert(EltTy == MRI.getType(Dst));
1804 
1805   B.setInstr(MI);
1806 
1807   if (IdxVal->Value < VecTy.getNumElements())
1808     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1809   else
1810     B.buildUndef(Dst);
1811 
1812   MI.eraseFromParent();
1813   return true;
1814 }
1815 
1816 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1817   MachineInstr &MI, MachineRegisterInfo &MRI,
1818   MachineIRBuilder &B) const {
1819   // TODO: Should move some of this into LegalizerHelper.
1820 
1821   // TODO: Promote dynamic indexing of s16 to s32
1822 
1823   // FIXME: Artifact combiner probably should have replaced the truncated
1824   // constant before this, so we shouldn't need
1825   // getConstantVRegValWithLookThrough.
1826   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1827     MI.getOperand(3).getReg(), MRI);
1828   if (!IdxVal) // Dynamic case will be selected to register indexing.
1829     return true;
1830 
1831   Register Dst = MI.getOperand(0).getReg();
1832   Register Vec = MI.getOperand(1).getReg();
1833   Register Ins = MI.getOperand(2).getReg();
1834 
1835   LLT VecTy = MRI.getType(Vec);
1836   LLT EltTy = VecTy.getElementType();
1837   assert(EltTy == MRI.getType(Ins));
1838 
1839   B.setInstr(MI);
1840 
1841   if (IdxVal->Value < VecTy.getNumElements())
1842     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1843   else
1844     B.buildUndef(Dst);
1845 
1846   MI.eraseFromParent();
1847   return true;
1848 }
1849 
1850 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1851   MachineInstr &MI, MachineRegisterInfo &MRI,
1852   MachineIRBuilder &B) const {
1853   const LLT V2S16 = LLT::vector(2, 16);
1854 
1855   Register Dst = MI.getOperand(0).getReg();
1856   Register Src0 = MI.getOperand(1).getReg();
1857   LLT DstTy = MRI.getType(Dst);
1858   LLT SrcTy = MRI.getType(Src0);
1859 
1860   if (SrcTy == V2S16 && DstTy == V2S16 &&
1861       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1862     return true;
1863 
1864   MachineIRBuilder HelperBuilder(MI);
1865   GISelObserverWrapper DummyObserver;
1866   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1867   HelperBuilder.setInstr(MI);
1868   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1869 }
1870 
1871 bool AMDGPULegalizerInfo::legalizeSinCos(
1872   MachineInstr &MI, MachineRegisterInfo &MRI,
1873   MachineIRBuilder &B) const {
1874   B.setInstr(MI);
1875 
1876   Register DstReg = MI.getOperand(0).getReg();
1877   Register SrcReg = MI.getOperand(1).getReg();
1878   LLT Ty = MRI.getType(DstReg);
1879   unsigned Flags = MI.getFlags();
1880 
1881   Register TrigVal;
1882   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1883   if (ST.hasTrigReducedRange()) {
1884     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1885     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1886       .addUse(MulVal.getReg(0))
1887       .setMIFlags(Flags).getReg(0);
1888   } else
1889     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1890 
1891   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1892     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1893   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1894     .addUse(TrigVal)
1895     .setMIFlags(Flags);
1896   MI.eraseFromParent();
1897   return true;
1898 }
1899 
1900 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1901   Register DstReg, LLT PtrTy,
1902   MachineIRBuilder &B, const GlobalValue *GV,
1903   unsigned Offset, unsigned GAFlags) const {
1904   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1905   // to the following code sequence:
1906   //
1907   // For constant address space:
1908   //   s_getpc_b64 s[0:1]
1909   //   s_add_u32 s0, s0, $symbol
1910   //   s_addc_u32 s1, s1, 0
1911   //
1912   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1913   //   a fixup or relocation is emitted to replace $symbol with a literal
1914   //   constant, which is a pc-relative offset from the encoding of the $symbol
1915   //   operand to the global variable.
1916   //
1917   // For global address space:
1918   //   s_getpc_b64 s[0:1]
1919   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1920   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1921   //
1922   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1923   //   fixups or relocations are emitted to replace $symbol@*@lo and
1924   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1925   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1926   //   operand to the global variable.
1927   //
1928   // What we want here is an offset from the value returned by s_getpc
1929   // (which is the address of the s_add_u32 instruction) to the global
1930   // variable, but since the encoding of $symbol starts 4 bytes after the start
1931   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1932   // small. This requires us to add 4 to the global variable offset in order to
1933   // compute the correct address.
1934 
1935   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1936 
1937   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1938     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1939 
1940   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1941     .addDef(PCReg);
1942 
1943   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1944   if (GAFlags == SIInstrInfo::MO_NONE)
1945     MIB.addImm(0);
1946   else
1947     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1948 
1949   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1950 
1951   if (PtrTy.getSizeInBits() == 32)
1952     B.buildExtract(DstReg, PCReg, 0);
1953   return true;
1954 }
1955 
1956 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1957   MachineInstr &MI, MachineRegisterInfo &MRI,
1958   MachineIRBuilder &B) const {
1959   Register DstReg = MI.getOperand(0).getReg();
1960   LLT Ty = MRI.getType(DstReg);
1961   unsigned AS = Ty.getAddressSpace();
1962 
1963   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1964   MachineFunction &MF = B.getMF();
1965   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1966   B.setInstr(MI);
1967 
1968   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1969     if (!MFI->isEntryFunction()) {
1970       const Function &Fn = MF.getFunction();
1971       DiagnosticInfoUnsupported BadLDSDecl(
1972         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1973         DS_Warning);
1974       Fn.getContext().diagnose(BadLDSDecl);
1975 
1976       // We currently don't have a way to correctly allocate LDS objects that
1977       // aren't directly associated with a kernel. We do force inlining of
1978       // functions that use local objects. However, if these dead functions are
1979       // not eliminated, we don't want a compile time error. Just emit a warning
1980       // and a trap, since there should be no callable path here.
1981       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1982       B.buildUndef(DstReg);
1983       MI.eraseFromParent();
1984       return true;
1985     }
1986 
1987     // TODO: We could emit code to handle the initialization somewhere.
1988     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1989       const SITargetLowering *TLI = ST.getTargetLowering();
1990       if (!TLI->shouldUseLDSConstAddress(GV)) {
1991         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1992         return true; // Leave in place.
1993       }
1994 
1995       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1996       MI.eraseFromParent();
1997       return true;
1998     }
1999 
2000     const Function &Fn = MF.getFunction();
2001     DiagnosticInfoUnsupported BadInit(
2002       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2003     Fn.getContext().diagnose(BadInit);
2004     return true;
2005   }
2006 
2007   const SITargetLowering *TLI = ST.getTargetLowering();
2008 
2009   if (TLI->shouldEmitFixup(GV)) {
2010     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2011     MI.eraseFromParent();
2012     return true;
2013   }
2014 
2015   if (TLI->shouldEmitPCReloc(GV)) {
2016     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2017     MI.eraseFromParent();
2018     return true;
2019   }
2020 
2021   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2022   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2023 
2024   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2025     MachinePointerInfo::getGOT(MF),
2026     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2027     MachineMemOperand::MOInvariant,
2028     8 /*Size*/, 8 /*Align*/);
2029 
2030   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2031 
2032   if (Ty.getSizeInBits() == 32) {
2033     // Truncate if this is a 32-bit constant address.
2034     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2035     B.buildExtract(DstReg, Load, 0);
2036   } else
2037     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2038 
2039   MI.eraseFromParent();
2040   return true;
2041 }
2042 
2043 bool AMDGPULegalizerInfo::legalizeLoad(
2044   MachineInstr &MI, MachineRegisterInfo &MRI,
2045   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2046   B.setInstr(MI);
2047   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2048   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2049   Observer.changingInstr(MI);
2050   MI.getOperand(1).setReg(Cast.getReg(0));
2051   Observer.changedInstr(MI);
2052   return true;
2053 }
2054 
2055 bool AMDGPULegalizerInfo::legalizeFMad(
2056   MachineInstr &MI, MachineRegisterInfo &MRI,
2057   MachineIRBuilder &B) const {
2058   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2059   assert(Ty.isScalar());
2060 
2061   MachineFunction &MF = B.getMF();
2062   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2063 
2064   // TODO: Always legal with future ftz flag.
2065   // FIXME: Do we need just output?
2066   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2067     return true;
2068   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2069     return true;
2070 
2071   MachineIRBuilder HelperBuilder(MI);
2072   GISelObserverWrapper DummyObserver;
2073   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2074   HelperBuilder.setMBB(*MI.getParent());
2075   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2076 }
2077 
2078 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2079   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2080   Register DstReg = MI.getOperand(0).getReg();
2081   Register PtrReg = MI.getOperand(1).getReg();
2082   Register CmpVal = MI.getOperand(2).getReg();
2083   Register NewVal = MI.getOperand(3).getReg();
2084 
2085   assert(SITargetLowering::isFlatGlobalAddrSpace(
2086            MRI.getType(PtrReg).getAddressSpace()) &&
2087          "this should not have been custom lowered");
2088 
2089   LLT ValTy = MRI.getType(CmpVal);
2090   LLT VecTy = LLT::vector(2, ValTy);
2091 
2092   B.setInstr(MI);
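  // The target pseudo expects the new value and the compare value packed into
  // a two-element vector operand. An illustrative MIR sketch (names and types
  // are made up):
  //   %packed:_(<2 x sN>) = G_BUILD_VECTOR %newval:_(sN), %cmpval:_(sN)
  //   %dst:_(sN) = G_AMDGPU_ATOMIC_CMPXCHG %ptr, %packed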
2093   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2094 
2095   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2096     .addDef(DstReg)
2097     .addUse(PtrReg)
2098     .addUse(PackedVal)
2099     .setMemRefs(MI.memoperands());
2100 
2101   MI.eraseFromParent();
2102   return true;
2103 }
2104 
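// Lower G_FLOG/G_FLOG10 in terms of G_FLOG2. Sketch:
//   log_b(x) = log2(x) * (1 / log2(b))
// The caller passes 1/log2(e) = ln(2) for G_FLOG and ln(2)/ln(10) = log10(2)
// for G_FLOG10 as Log2BaseInverted.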
2105 bool AMDGPULegalizerInfo::legalizeFlog(
2106   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2107   Register Dst = MI.getOperand(0).getReg();
2108   Register Src = MI.getOperand(1).getReg();
2109   LLT Ty = B.getMRI()->getType(Dst);
2110   unsigned Flags = MI.getFlags();
2111   B.setInstr(MI);
2112 
2113   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2114   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2115 
2116   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2117   MI.eraseFromParent();
2118   return true;
2119 }
2120 
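// Lower G_FEXP in terms of G_FEXP2: exp(x) = exp2(x * log2(e)). A sketch of
// the emitted sequence (register names are illustrative only):
//   %k:_(sN)   = G_FCONSTANT log2(e)
//   %mul:_(sN) = G_FMUL %src, %k
//   %dst:_(sN) = G_FEXP2 %mul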
2121 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2122                                        MachineIRBuilder &B) const {
2123   Register Dst = MI.getOperand(0).getReg();
2124   Register Src = MI.getOperand(1).getReg();
2125   unsigned Flags = MI.getFlags();
2126   LLT Ty = B.getMRI()->getType(Dst);
2127   B.setInstr(MI);
2128 
2129   auto K = B.buildFConstant(Ty, numbers::log2e);
2130   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2131   B.buildFExp2(Dst, Mul, Flags);
2132   MI.eraseFromParent();
2133   return true;
2134 }
2135 
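// Lower G_FPOW as, roughly, exp2(y * log2(x)), with the intermediate product
// done by the fmul_legacy intrinsic. The f16 case extends to f32 for the
// multiply since there is no f16 fmul_legacy.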
2136 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2137                                        MachineIRBuilder &B) const {
2138   Register Dst = MI.getOperand(0).getReg();
2139   Register Src0 = MI.getOperand(1).getReg();
2140   Register Src1 = MI.getOperand(2).getReg();
2141   unsigned Flags = MI.getFlags();
2142   LLT Ty = B.getMRI()->getType(Dst);
2143   B.setInstr(MI);
2144   const LLT S16 = LLT::scalar(16);
2145   const LLT S32 = LLT::scalar(32);
2146 
2147   if (Ty == S32) {
2148     auto Log = B.buildFLog2(S32, Src0, Flags);
2149     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2150       .addUse(Log.getReg(0))
2151       .addUse(Src1)
2152       .setMIFlags(Flags);
2153     B.buildFExp2(Dst, Mul, Flags);
2154   } else if (Ty == S16) {
2155     // There's no f16 fmul_legacy, so we need to convert for it.
2156     auto Log = B.buildFLog2(S16, Src0, Flags);
2157     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2158     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2159     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2160       .addUse(Ext0.getReg(0))
2161       .addUse(Ext1.getReg(0))
2162       .setMIFlags(Flags);
2163 
2164     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2165   } else
2166     return false;
2167 
2168   MI.eraseFromParent();
2169   return true;
2170 }
2171 
2172 // Find a source register, ignoring any possible source modifiers.
2173 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2174   Register ModSrc = OrigSrc;
2175   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2176     ModSrc = SrcFNeg->getOperand(1).getReg();
2177     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2178       ModSrc = SrcFAbs->getOperand(1).getReg();
2179   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2180     ModSrc = SrcFAbs->getOperand(1).getReg();
2181   return ModSrc;
2182 }
2183 
2184 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2185                                          MachineRegisterInfo &MRI,
2186                                          MachineIRBuilder &B) const {
2187   B.setInstr(MI);
2188 
2189   const LLT S1 = LLT::scalar(1);
2190   const LLT S64 = LLT::scalar(64);
2191   Register Dst = MI.getOperand(0).getReg();
2192   Register OrigSrc = MI.getOperand(1).getReg();
2193   unsigned Flags = MI.getFlags();
2194   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2195          "this should not have been custom lowered");
2196 
2197   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2198   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2199   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2200   // V_FRACT bug is:
2201   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2202   //
2203   // Convert floor(x) to (x - fract(x))
2204 
2205   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2206     .addUse(OrigSrc)
2207     .setMIFlags(Flags);
2208 
2209   // Give source modifier matching some assistance before obscuring a foldable
2210   // pattern.
2211 
2212   // TODO: We can avoid the neg on the fract? The input sign to fract
2213   // shouldn't matter?
2214   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2215 
2216   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2217 
2218   Register Min = MRI.createGenericVirtualRegister(S64);
2219 
2220   // We don't need to concern ourselves with the snan handling difference, so
2221   // use the one which will directly select.
2222   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2223   if (MFI->getMode().IEEE)
2224     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2225   else
2226     B.buildFMinNum(Min, Fract, Const, Flags);
2227 
2228   Register CorrectedFract = Min;
2229   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2230     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2231     CorrectedFract = B.buildSelect(S64, IsNan, Min, ModSrc, Flags).getReg(0);
2232   }
2233 
2234   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2235   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2236 
2237   MI.eraseFromParent();
2238   return true;
2239 }
2240 
2241 // Turn an illegal packed v2s16 build vector into bit operations.
2242 // TODO: This should probably be a bitcast action in LegalizerHelper.
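//
// A sketch of the replacement (register names are illustrative only):
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR %lo:_(s16), %hi:_(s16)
// becomes
//   %merge:_(s32) = G_MERGE_VALUES %lo:_(s16), %hi:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge:_(s32)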
2243 bool AMDGPULegalizerInfo::legalizeBuildVector(
2244   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2245   Register Dst = MI.getOperand(0).getReg();
2246   LLT DstTy = MRI.getType(Dst);
2247   const LLT S32 = LLT::scalar(32);
2248   const LLT V2S16 = LLT::vector(2, 16);
2249   (void)DstTy;
2250   (void)V2S16;
2251   assert(DstTy == V2S16);
2252 
2253   Register Src0 = MI.getOperand(1).getReg();
2254   Register Src1 = MI.getOperand(2).getReg();
2255   assert(MRI.getType(Src0) == LLT::scalar(16));
2256 
2257   B.setInstr(MI);
2258   auto Merge = B.buildMerge(S32, {Src0, Src1});
2259   B.buildBitcast(Dst, Merge);
2260 
2261   MI.eraseFromParent();
2262   return true;
2263 }
2264 
2265 // Return the use branch instruction, otherwise null if the usage is invalid.
2266 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2267                                        MachineRegisterInfo &MRI,
2268                                        MachineInstr *&Br) {
2269   Register CondDef = MI.getOperand(0).getReg();
2270   if (!MRI.hasOneNonDBGUse(CondDef))
2271     return nullptr;
2272 
2273   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2274   if (UseMI.getParent() != MI.getParent() ||
2275       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2276     return nullptr;
2277 
2278   // Make sure the cond br is followed by a G_BR
2279   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2280   if (Next != MI.getParent()->end()) {
2281     if (Next->getOpcode() != AMDGPU::G_BR)
2282       return nullptr;
2283     Br = &*Next;
2284   }
2285 
2286   return &UseMI;
2287 }
2288 
2289 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2290                                                MachineRegisterInfo &MRI,
2291                                                Register LiveIn,
2292                                                Register PhyReg) const {
2293   assert(PhyReg.isPhysical() && "Physical register expected");
2294 
2295   // Insert the live-in copy, if required, by defining the destination
2296   // virtual register.
2297   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2298   if (!MRI.getVRegDef(LiveIn)) {
2299     // FIXME: Should have scoped insert pt
2300     MachineBasicBlock &OrigInsBB = B.getMBB();
2301     auto OrigInsPt = B.getInsertPt();
2302 
2303     MachineBasicBlock &EntryMBB = B.getMF().front();
2304     EntryMBB.addLiveIn(PhyReg);
2305     B.setInsertPt(EntryMBB, EntryMBB.begin());
2306     B.buildCopy(LiveIn, PhyReg);
2307 
2308     B.setInsertPt(OrigInsBB, OrigInsPt);
2309   }
2310 
2311   return LiveIn;
2312 }
2313 
2314 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2315                                                 MachineRegisterInfo &MRI,
2316                                                 Register PhyReg, LLT Ty,
2317                                                 bool InsertLiveInCopy) const {
2318   assert(PhyReg.isPhysical() && "Physical register expected");
2319 
2320   // Get or create the virtual live-in register
2321   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2322   if (!LiveIn) {
2323     LiveIn = MRI.createGenericVirtualRegister(Ty);
2324     MRI.addLiveIn(PhyReg, LiveIn);
2325   }
2326 
2327   // When the actual true copy required is from a virtual register to a
2328   // physical register (to be inserted later), a live-in copy from the
2329   // physical register to the virtual register is not required.
2330   if (!InsertLiveInCopy)
2331     return LiveIn;
2332 
2333   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2334 }
2335 
2336 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2337     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2338   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2339   const ArgDescriptor *Arg;
2340   const TargetRegisterClass *RC;
2341   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2342   if (!Arg) {
2343     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2344     return nullptr;
2345   }
2346   return Arg;
2347 }
2348 
2349 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2350                                          const ArgDescriptor *Arg) const {
2351   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2352     return false; // TODO: Handle these
2353 
2354   Register SrcReg = Arg->getRegister();
2355   assert(SrcReg.isPhysical() && "Physical register expected");
2356   assert(DstReg.isVirtual() && "Virtual register expected");
2357 
2358   MachineRegisterInfo &MRI = *B.getMRI();
2359 
2360   LLT Ty = MRI.getType(DstReg);
2361   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2362 
2363   if (Arg->isMasked()) {
2364     // TODO: Should we try to emit this once in the entry block?
2365     const LLT S32 = LLT::scalar(32);
2366     const unsigned Mask = Arg->getMask();
2367     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2368 
2369     Register AndMaskSrc = LiveIn;
2370 
2371     if (Shift != 0) {
2372       auto ShiftAmt = B.buildConstant(S32, Shift);
2373       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2374     }
2375 
2376     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2377   } else {
2378     B.buildCopy(DstReg, LiveIn);
2379   }
2380 
2381   return true;
2382 }
2383 
2384 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2385     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2386     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2387   B.setInstr(MI);
2388 
2389   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2390   if (!Arg)
2391     return false;
2392 
2393   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2394     return false;
2395 
2396   MI.eraseFromParent();
2397   return true;
2398 }
2399 
2400 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2401                                        MachineRegisterInfo &MRI,
2402                                        MachineIRBuilder &B) const {
2403   B.setInstr(MI);
2404   Register Dst = MI.getOperand(0).getReg();
2405   LLT DstTy = MRI.getType(Dst);
2406   LLT S16 = LLT::scalar(16);
2407   LLT S32 = LLT::scalar(32);
2408   LLT S64 = LLT::scalar(64);
2409 
2410   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2411     return true;
2412 
2413   if (DstTy == S16)
2414     return legalizeFDIV16(MI, MRI, B);
2415   if (DstTy == S32)
2416     return legalizeFDIV32(MI, MRI, B);
2417   if (DstTy == S64)
2418     return legalizeFDIV64(MI, MRI, B);
2419 
2420   return false;
2421 }
2422 
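// Build an approximation of 2^32 / Src for the 32-bit division expansion
// below. Roughly: convert Src to f32, take the hardware reciprocal, scale by
// 2^32 (0x4f800000 is the f32 bit pattern for 2^32), and convert back to an
// unsigned integer.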
2423 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2424   const LLT S32 = LLT::scalar(32);
2425 
2426   auto Cvt0 = B.buildUITOFP(S32, Src);
2427   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2428   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2429   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2430   return B.buildFPTOUI(S32, Mul).getReg(0);
2431 }
2432 
2433 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2434                                                   Register DstReg,
2435                                                   Register Num,
2436                                                   Register Den,
2437                                                   bool IsRem) const {
2438   const LLT S1 = LLT::scalar(1);
2439   const LLT S32 = LLT::scalar(32);
2440 
2441   // RCP =  URECIP(Den) = 2^32 / Den + e
2442   // e is rounding error.
2443   auto RCP = buildDivRCP(B, Den);
2444 
2445   // RCP_LO = mul(RCP, Den)
2446   auto RCP_LO = B.buildMul(S32, RCP, Den);
2447 
2448   // RCP_HI = mulhu(RCP, Den)
2449   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2450 
2451   // NEG_RCP_LO = -RCP_LO
2452   auto Zero = B.buildConstant(S32, 0);
2453   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2454 
2455   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2456   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2457   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2458 
2459   // Calculate the rounding error from the URECIP instruction
2460   // E = mulhu(ABS_RCP_LO, RCP)
2461   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2462 
2463   // RCP_A_E = RCP + E
2464   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2465 
2466   // RCP_S_E = RCP - E
2467   auto RCP_S_E = B.buildSub(S32, RCP, E);
2468 
2469   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2470   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2471 
2472   // Quotient = mulhu(Tmp0, Num)
2473   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2474 
2475   // Num_S_Remainder = Quotient * Den
2476   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2477 
2478   // Remainder = Num - Num_S_Remainder
2479   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2480 
2481   // Remainder_GE_Den = Remainder >= Den
2482   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2483 
2484   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2485   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2486                                        Num, Num_S_Remainder);
2487 
2488   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2489   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2490 
2491   // Calculate Division result:
2492 
2493   // Quotient_A_One = Quotient + 1
2494   auto One = B.buildConstant(S32, 1);
2495   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2496 
2497   // Quotient_S_One = Quotient - 1
2498   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2499 
2500   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2501   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2502 
2503   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2504   if (IsRem) {
2505     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2506 
2507     // Calculate Rem result:
2508     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2509 
2510     // Remainder_A_Den = Remainder + Den
2511     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2512 
2513     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2514     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2515 
2516     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2517     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2518   } else {
2519     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2520   }
2521 }
2522 
2523 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2524                                               MachineRegisterInfo &MRI,
2525                                               MachineIRBuilder &B) const {
2526   B.setInstr(MI);
2527   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2528   Register DstReg = MI.getOperand(0).getReg();
2529   Register Num = MI.getOperand(1).getReg();
2530   Register Den = MI.getOperand(2).getReg();
2531   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2532   MI.eraseFromParent();
2533   return true;
2534 }
2535 
2536 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2537                                             MachineRegisterInfo &MRI,
2538                                             MachineIRBuilder &B) const {
2539   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2540     return legalizeUDIV_UREM32(MI, MRI, B);
2541   return false;
2542 }
2543 
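// Lower 32-bit signed division/remainder on top of the unsigned expansion
// above: take absolute values with the (x + sign) ^ sign idiom, do the
// unsigned divide/remainder, then reapply the sign (the remainder takes the
// sign of the dividend, the quotient the xor of the two signs).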
2544 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2545                                               MachineRegisterInfo &MRI,
2546                                               MachineIRBuilder &B) const {
2547   B.setInstr(MI);
2548   const LLT S32 = LLT::scalar(32);
2549 
2550   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2551   Register DstReg = MI.getOperand(0).getReg();
2552   Register LHS = MI.getOperand(1).getReg();
2553   Register RHS = MI.getOperand(2).getReg();
2554 
2555   auto ThirtyOne = B.buildConstant(S32, 31);
2556   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2557   auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2558 
2559   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2560   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2561 
2562   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2563   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2564 
2565   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2566   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2567 
2568   if (IsRem) {
2569     auto RSign = LHSign; // Remainder sign is the same as LHS
2570     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2571     B.buildSub(DstReg, UDivRem, RSign);
2572   } else {
2573     auto DSign = B.buildXor(S32, LHSign, RHSign);
2574     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2575     B.buildSub(DstReg, UDivRem, DSign);
2576   }
2577 
2578   MI.eraseFromParent();
2579   return true;
2580 }
2581 
2582 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2583                                             MachineRegisterInfo &MRI,
2584                                             MachineIRBuilder &B) const {
2585   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2586     return legalizeSDIV_SREM32(MI, MRI, B);
2587   return false;
2588 }
2589 
2590 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2591                                                  MachineRegisterInfo &MRI,
2592                                                  MachineIRBuilder &B) const {
2593   Register Res = MI.getOperand(0).getReg();
2594   Register LHS = MI.getOperand(1).getReg();
2595   Register RHS = MI.getOperand(2).getReg();
2596 
2597   uint16_t Flags = MI.getFlags();
2598 
2599   LLT ResTy = MRI.getType(Res);
2600   LLT S32 = LLT::scalar(32);
2601   LLT S64 = LLT::scalar(64);
2602 
2603   const MachineFunction &MF = B.getMF();
2604   bool Unsafe =
2605     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2606 
2607   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2608     return false;
2609 
2610   if (!Unsafe && ResTy == S32 &&
2611       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2612     return false;
2613 
2614   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2615     // 1 / x -> RCP(x)
2616     if (CLHS->isExactlyValue(1.0)) {
2617       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2618         .addUse(RHS)
2619         .setMIFlags(Flags);
2620 
2621       MI.eraseFromParent();
2622       return true;
2623     }
2624 
2625     // -1 / x -> RCP( FNEG(x) )
2626     if (CLHS->isExactlyValue(-1.0)) {
2627       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2628       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2629         .addUse(FNeg.getReg(0))
2630         .setMIFlags(Flags);
2631 
2632       MI.eraseFromParent();
2633       return true;
2634     }
2635   }
2636 
2637   // x / y -> x * (1.0 / y)
2638   if (Unsafe) {
2639     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2640       .addUse(RHS)
2641       .setMIFlags(Flags);
2642     B.buildFMul(Res, LHS, RCP, Flags);
2643 
2644     MI.eraseFromParent();
2645     return true;
2646   }
2647 
2648   return false;
2649 }
2650 
2651 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2652                                          MachineRegisterInfo &MRI,
2653                                          MachineIRBuilder &B) const {
2654   B.setInstr(MI);
2655   Register Res = MI.getOperand(0).getReg();
2656   Register LHS = MI.getOperand(1).getReg();
2657   Register RHS = MI.getOperand(2).getReg();
2658 
2659   uint16_t Flags = MI.getFlags();
2660 
2661   LLT S16 = LLT::scalar(16);
2662   LLT S32 = LLT::scalar(32);
2663 
2664   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2665   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2666 
2667   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2668     .addUse(RHSExt.getReg(0))
2669     .setMIFlags(Flags);
2670 
2671   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2672   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2673 
2674   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2675     .addUse(RDst.getReg(0))
2676     .addUse(RHS)
2677     .addUse(LHS)
2678     .setMIFlags(Flags);
2679 
2680   MI.eraseFromParent();
2681   return true;
2682 }
2683 
2684 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2685 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
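// A rough sketch of the encodings involved (see SIDefines.h for the
// authoritative values): with S_DENORM_MODE the immediate packs the FP32 mode
// in bits [1:0] and the FP64/FP16 mode in bits [3:2]; without it, an
// S_SETREG_IMM32_B32 rewrites the two FP32 denorm bits of the MODE register
// (offset 4, width 2).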
2686 static void toggleSPDenormMode(bool Enable,
2687                                MachineIRBuilder &B,
2688                                const GCNSubtarget &ST,
2689                                AMDGPU::SIModeRegisterDefaults Mode) {
2690   // Set SP denorm mode to this value.
2691   unsigned SPDenormMode =
2692     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2693 
2694   if (ST.hasDenormModeInst()) {
2695     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2696     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2697 
2698     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2699     B.buildInstr(AMDGPU::S_DENORM_MODE)
2700       .addImm(NewDenormModeValue);
2701 
2702   } else {
2703     // Select FP32 bit field in mode register.
2704     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2705                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2706                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2707 
2708     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2709       .addImm(SPDenormMode)
2710       .addImm(SPDenormModeBitField);
2711   }
2712 }
2713 
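// f32 division. A rough outline of the sequence below: scale the operands
// with amdgcn.div.scale, refine an amdgcn.rcp estimate of the scaled
// denominator with a chain of FMAs (Newton-Raphson style), combine the pieces
// with amdgcn.div.fmas, and let amdgcn.div.fixup handle the special cases.
// FP32 denormals are temporarily enabled around the FMA chain when the
// current mode has them disabled.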
2714 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2715                                          MachineRegisterInfo &MRI,
2716                                          MachineIRBuilder &B) const {
2717   B.setInstr(MI);
2718   Register Res = MI.getOperand(0).getReg();
2719   Register LHS = MI.getOperand(1).getReg();
2720   Register RHS = MI.getOperand(2).getReg();
2721   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2722   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2723 
2724   uint16_t Flags = MI.getFlags();
2725 
2726   LLT S32 = LLT::scalar(32);
2727   LLT S1 = LLT::scalar(1);
2728 
2729   auto One = B.buildFConstant(S32, 1.0f);
2730 
2731   auto DenominatorScaled =
2732     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2733       .addUse(RHS)
2734       .addUse(LHS)
2735       .addImm(1)
2736       .setMIFlags(Flags);
2737   auto NumeratorScaled =
2738     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2739       .addUse(LHS)
2740       .addUse(RHS)
2741       .addImm(0)
2742       .setMIFlags(Flags);
2743 
2744   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2745     .addUse(DenominatorScaled.getReg(0))
2746     .setMIFlags(Flags);
2747   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2748 
2749   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2750   // aren't modeled as reading it.
2751   if (!Mode.allFP32Denormals())
2752     toggleSPDenormMode(true, B, ST, Mode);
2753 
2754   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2755   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2756   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2757   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2758   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2759   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2760 
2761   if (!Mode.allFP32Denormals())
2762     toggleSPDenormMode(false, B, ST, Mode);
2763 
2764   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2765     .addUse(Fma4.getReg(0))
2766     .addUse(Fma1.getReg(0))
2767     .addUse(Fma3.getReg(0))
2768     .addUse(NumeratorScaled.getReg(1))
2769     .setMIFlags(Flags);
2770 
2771   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2772     .addUse(Fmas.getReg(0))
2773     .addUse(RHS)
2774     .addUse(LHS)
2775     .setMIFlags(Flags);
2776 
2777   MI.eraseFromParent();
2778   return true;
2779 }
2780 
2781 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2782                                          MachineRegisterInfo &MRI,
2783                                          MachineIRBuilder &B) const {
2784   B.setInstr(MI);
2785   Register Res = MI.getOperand(0).getReg();
2786   Register LHS = MI.getOperand(1).getReg();
2787   Register RHS = MI.getOperand(2).getReg();
2788 
2789   uint16_t Flags = MI.getFlags();
2790 
2791   LLT S64 = LLT::scalar(64);
2792   LLT S1 = LLT::scalar(1);
2793 
2794   auto One = B.buildFConstant(S64, 1.0);
2795 
2796   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2797     .addUse(LHS)
2798     .addUse(RHS)
2799     .addImm(1)
2800     .setMIFlags(Flags);
2801 
2802   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2803 
2804   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2805     .addUse(DivScale0.getReg(0))
2806     .setMIFlags(Flags);
2807 
2808   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2809   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2810   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2811 
2812   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2813     .addUse(LHS)
2814     .addUse(RHS)
2815     .addImm(0)
2816     .setMIFlags(Flags);
2817 
2818   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2819   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2820   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2821 
2822   Register Scale;
2823   if (!ST.hasUsableDivScaleConditionOutput()) {
2824     // Workaround a hardware bug on SI where the condition output from div_scale
2825     // is not usable.
2826 
2827     LLT S32 = LLT::scalar(32);
2828 
2829     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2830     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2831     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2832     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2833 
2834     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2835                               Scale1Unmerge.getReg(1));
2836     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2837                               Scale0Unmerge.getReg(1));
2838     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2839   } else {
2840     Scale = DivScale1.getReg(1);
2841   }
2842 
2843   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2844     .addUse(Fma4.getReg(0))
2845     .addUse(Fma3.getReg(0))
2846     .addUse(Mul.getReg(0))
2847     .addUse(Scale)
2848     .setMIFlags(Flags);
2849 
2850   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2851     .addUse(Fmas.getReg(0))
2852     .addUse(RHS)
2853     .addUse(LHS)
2854     .setMIFlags(Flags);
2855 
2856   MI.eraseFromParent();
2857   return true;
2858 }
2859 
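// Lower the amdgcn.fdiv.fast intrinsic. Sketch: if |den| > 2^96 (0x6f800000
// is the f32 bit pattern for 2^96), pre-scale the denominator by 2^-32
// (0x2f800000) before taking the reciprocal, then multiply the quotient by
// the same scale factor to compensate.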
2860 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2861                                                  MachineRegisterInfo &MRI,
2862                                                  MachineIRBuilder &B) const {
2863   B.setInstr(MI);
2864   Register Res = MI.getOperand(0).getReg();
2865   Register LHS = MI.getOperand(2).getReg();
2866   Register RHS = MI.getOperand(3).getReg();
2867   uint16_t Flags = MI.getFlags();
2868 
2869   LLT S32 = LLT::scalar(32);
2870   LLT S1 = LLT::scalar(1);
2871 
2872   auto Abs = B.buildFAbs(S32, RHS, Flags);
2873   const APFloat C0Val(1.0f);
2874 
2875   auto C0 = B.buildConstant(S32, 0x6f800000);
2876   auto C1 = B.buildConstant(S32, 0x2f800000);
2877   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2878 
2879   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2880   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2881 
2882   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2883 
2884   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2885     .addUse(Mul0.getReg(0))
2886     .setMIFlags(Flags);
2887 
2888   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2889 
2890   B.buildFMul(Res, Sel, Mul1, Flags);
2891 
2892   MI.eraseFromParent();
2893   return true;
2894 }
2895 
2896 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2897                                                  MachineRegisterInfo &MRI,
2898                                                  MachineIRBuilder &B) const {
2899   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2900   if (!MFI->isEntryFunction()) {
2901     return legalizePreloadedArgIntrin(MI, MRI, B,
2902                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2903   }
2904 
2905   B.setInstr(MI);
2906 
2907   uint64_t Offset =
2908     ST.getTargetLowering()->getImplicitParameterOffset(
2909       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2910   Register DstReg = MI.getOperand(0).getReg();
2911   LLT DstTy = MRI.getType(DstReg);
2912   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
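  // The implicit arguments live at a fixed offset past the explicit kernel
  // arguments, so the pointer is materialized as kernarg_segment_ptr + Offset.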
2913 
2914   const ArgDescriptor *Arg;
2915   const TargetRegisterClass *RC;
2916   std::tie(Arg, RC)
2917     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2918   if (!Arg)
2919     return false;
2920 
2921   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2922   if (!loadInputValue(KernargPtrReg, B, Arg))
2923     return false;
2924 
2925   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2926   MI.eraseFromParent();
2927   return true;
2928 }
2929 
2930 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2931                                               MachineRegisterInfo &MRI,
2932                                               MachineIRBuilder &B,
2933                                               unsigned AddrSpace) const {
2934   B.setInstr(MI);
2935   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
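  // A flat pointer belongs to the queried segment iff its high 32 bits equal
  // that segment's aperture base, so compare the extracted high dword below.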
2936   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2937   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2938   MI.eraseFromParent();
2939   return true;
2940 }
2941 
2942 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2943 // offset (the offset that is included in bounds checking and swizzling, to be
2944 // split between the instruction's voffset and immoffset fields) and soffset
2945 // (the offset that is excluded from bounds checking and swizzling, to go in
2946 // the instruction's soffset field).  This function takes the first kind of
2947 // offset and figures out how to split it between voffset and immoffset.
2948 std::tuple<Register, unsigned, unsigned>
2949 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2950                                         Register OrigOffset) const {
2951   const unsigned MaxImm = 4095;
2952   Register BaseReg;
2953   unsigned TotalConstOffset;
2954   MachineInstr *OffsetDef;
2955   const LLT S32 = LLT::scalar(32);
2956 
2957   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2958     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2959 
2960   unsigned ImmOffset = TotalConstOffset;
2961 
2962   // If the immediate value is too big for the immoffset field, put the value
2963   // and -4096 into the immoffset field so that the value that is copied/added
2964   // for the voffset field is a multiple of 4096, and it stands more chance
2965   // of being CSEd with the copy/add for another similar load/store.
2966   // However, do not do that rounding down to a multiple of 4096 if that is a
2967   // negative number, as it appears to be illegal to have a negative offset
2968   // in the vgpr, even if adding the immediate offset makes it positive.
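  // Illustrative example: TotalConstOffset = 4607 (0x11ff) splits into
  // Overflow = 4096 added to the voffset and ImmOffset = 511; a negative
  // offset such as -4096 keeps ImmOffset = 0 and leaves the whole value in
  // the voffset add.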
2969   unsigned Overflow = ImmOffset & ~MaxImm;
2970   ImmOffset -= Overflow;
2971   if ((int32_t)Overflow < 0) {
2972     Overflow += ImmOffset;
2973     ImmOffset = 0;
2974   }
2975 
2976   if (Overflow != 0) {
2977     if (!BaseReg) {
2978       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2979     } else {
2980       auto OverflowVal = B.buildConstant(S32, Overflow);
2981       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2982     }
2983   }
2984 
2985   if (!BaseReg)
2986     BaseReg = B.buildConstant(S32, 0).getReg(0);
2987 
2988   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2989 }
2990 
/// Handle the register layout difference for f16 images on some subtargets.
2992 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2993                                              MachineRegisterInfo &MRI,
2994                                              Register Reg) const {
2995   if (!ST.hasUnpackedD16VMem())
2996     return Reg;
2997 
2998   const LLT S16 = LLT::scalar(16);
2999   const LLT S32 = LLT::scalar(32);
3000   LLT StoreVT = MRI.getType(Reg);
3001   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
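  // Illustrative: with the unpacked layout each 16-bit element occupies the
  // low half of a 32-bit register, so e.g. a <4 x s16> value is rebuilt below
  // as a <4 x s32> of any-extended elements.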
3002 
3003   auto Unmerge = B.buildUnmerge(S16, Reg);
3004 
3005   SmallVector<Register, 4> WideRegs;
3006   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3007     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3008 
3009   int NumElts = StoreVT.getNumElements();
3010 
3011   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3012 }
3013 
3014 Register AMDGPULegalizerInfo::fixStoreSourceType(
3015   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3016   MachineRegisterInfo *MRI = B.getMRI();
3017   LLT Ty = MRI->getType(VData);
3018 
3019   const LLT S16 = LLT::scalar(16);
3020 
  // Fix up illegal register types for 8-bit and 16-bit stores.
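  // (Illustrative) the s8/s16 value is any-extended to s32 here; the byte or
  // short buffer-store opcode selected later stores only the low bits.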
3022   if (Ty == LLT::scalar(8) || Ty == S16) {
3023     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3024     return AnyExt;
3025   }
3026 
3027   if (Ty.isVector()) {
3028     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3029       if (IsFormat)
3030         return handleD16VData(B, *MRI, VData);
3031     }
3032   }
3033 
3034   return VData;
3035 }
3036 
3037 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3038                                               MachineRegisterInfo &MRI,
3039                                               MachineIRBuilder &B,
3040                                               bool IsTyped,
3041                                               bool IsFormat) const {
3042   B.setInstr(MI);
3043 
3044   Register VData = MI.getOperand(1).getReg();
3045   LLT Ty = MRI.getType(VData);
3046   LLT EltTy = Ty.getScalarType();
3047   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3048   const LLT S32 = LLT::scalar(32);
3049 
3050   VData = fixStoreSourceType(B, VData, IsFormat);
3051   Register RSrc = MI.getOperand(2).getReg();
3052 
3053   MachineMemOperand *MMO = *MI.memoperands_begin();
3054   const int MemSize = MMO->getSize();
3055 
3056   unsigned ImmOffset;
3057   unsigned TotalOffset;
3058 
3059   // The typed intrinsics add an immediate after the registers.
3060   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
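  // Illustrative operand layout: a raw store is
  //   (intrinsic id, vdata, rsrc, voffset, soffset[, format], aux)
  // and the struct variants insert a vindex operand after rsrc, giving
  // NumVIndexOps total operands.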
3061 
3062   // The struct intrinsic variants add one additional operand over raw.
3063   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3064   Register VIndex;
3065   int OpOffset = 0;
3066   if (HasVIndex) {
3067     VIndex = MI.getOperand(3).getReg();
3068     OpOffset = 1;
3069   }
3070 
3071   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3072   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3073 
3074   unsigned Format = 0;
3075   if (IsTyped) {
3076     Format = MI.getOperand(5 + OpOffset).getImm();
3077     ++OpOffset;
3078   }
3079 
3080   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3081 
3082   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3083   if (TotalOffset != 0)
3084     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3085 
3086   unsigned Opc;
3087   if (IsTyped) {
3088     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3089                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3090   } else if (IsFormat) {
3091     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3092                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3093   } else {
3094     switch (MemSize) {
3095     case 1:
3096       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3097       break;
3098     case 2:
3099       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3100       break;
3101     default:
3102       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3103       break;
3104     }
3105   }
3106 
3107   if (!VIndex)
3108     VIndex = B.buildConstant(S32, 0).getReg(0);
3109 
3110   auto MIB = B.buildInstr(Opc)
3111     .addUse(VData)              // vdata
3112     .addUse(RSrc)               // rsrc
3113     .addUse(VIndex)             // vindex
3114     .addUse(VOffset)            // voffset
3115     .addUse(SOffset)            // soffset
3116     .addImm(ImmOffset);         // offset(imm)
3117 
3118   if (IsTyped)
3119     MIB.addImm(Format);
3120 
3121   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3122      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3123      .addMemOperand(MMO);
3124 
3125   MI.eraseFromParent();
3126   return true;
3127 }
3128 
3129 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3130                                              MachineRegisterInfo &MRI,
3131                                              MachineIRBuilder &B,
3132                                              bool IsFormat,
3133                                              bool IsTyped) const {
3134   B.setInstr(MI);
3135 
3136   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3137   MachineMemOperand *MMO = *MI.memoperands_begin();
3138   const int MemSize = MMO->getSize();
3139   const LLT S32 = LLT::scalar(32);
3140 
3141   Register Dst = MI.getOperand(0).getReg();
3142   Register RSrc = MI.getOperand(2).getReg();
3143 
3144   // The typed intrinsics add an immediate after the registers.
3145   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3146 
3147   // The struct intrinsic variants add one additional operand over raw.
3148   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3149   Register VIndex;
3150   int OpOffset = 0;
3151   if (HasVIndex) {
3152     VIndex = MI.getOperand(3).getReg();
3153     OpOffset = 1;
3154   }
3155 
3156   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3157   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3158 
3159   unsigned Format = 0;
3160   if (IsTyped) {
3161     Format = MI.getOperand(5 + OpOffset).getImm();
3162     ++OpOffset;
3163   }
3164 
3165   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3166   unsigned ImmOffset;
3167   unsigned TotalOffset;
3168 
3169   LLT Ty = MRI.getType(Dst);
3170   LLT EltTy = Ty.getScalarType();
3171   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3172   const bool Unpacked = ST.hasUnpackedD16VMem();
3173 
3174   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3175   if (TotalOffset != 0)
3176     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3177 
3178   unsigned Opc;
3179 
3180   if (IsTyped) {
3181     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3182                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3183   } else if (IsFormat) {
3184     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3185                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3186   } else {
3187     switch (MemSize) {
3188     case 1:
3189       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3190       break;
3191     case 2:
3192       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3193       break;
3194     default:
3195       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3196       break;
3197     }
3198   }
3199 
3200   Register LoadDstReg;
3201 
3202   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3203   LLT UnpackedTy = Ty.changeElementSize(32);
3204 
3205   if (IsExtLoad)
3206     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3207   else if (Unpacked && IsD16 && Ty.isVector())
3208     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3209   else
3210     LoadDstReg = Dst;
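  // Illustrative: an s8/s16 load is performed into a temporary s32 and then
  // truncated, while a d16 vector load on an unpacked subtarget is performed
  // into an s32-per-element temporary and repacked after the instruction.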
3211 
3212   if (!VIndex)
3213     VIndex = B.buildConstant(S32, 0).getReg(0);
3214 
3215   auto MIB = B.buildInstr(Opc)
3216     .addDef(LoadDstReg)         // vdata
3217     .addUse(RSrc)               // rsrc
3218     .addUse(VIndex)             // vindex
3219     .addUse(VOffset)            // voffset
3220     .addUse(SOffset)            // soffset
3221     .addImm(ImmOffset);         // offset(imm)
3222 
3223   if (IsTyped)
3224     MIB.addImm(Format);
3225 
3226   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3227      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3228      .addMemOperand(MMO);
3229 
3230   if (LoadDstReg != Dst) {
3231     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3232 
    // The result was widened for the extending load; truncate it back down.
3234     if (IsExtLoad)
3235       B.buildTrunc(Dst, LoadDstReg);
3236     else {
3237       // Repack to original 16-bit vector result
3238       // FIXME: G_TRUNC should work, but legalization currently fails
3239       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3240       SmallVector<Register, 4> Repack;
3241       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3242         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3243       B.buildMerge(Dst, Repack);
3244     }
3245   }
3246 
3247   MI.eraseFromParent();
3248   return true;
3249 }
3250 
3251 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3252                                                MachineIRBuilder &B,
3253                                                bool IsInc) const {
3254   B.setInstr(MI);
3255   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3256                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3257   B.buildInstr(Opc)
3258     .addDef(MI.getOperand(0).getReg())
3259     .addUse(MI.getOperand(2).getReg())
3260     .addUse(MI.getOperand(3).getReg())
3261     .cloneMemRefs(MI);
3262   MI.eraseFromParent();
3263   return true;
3264 }
3265 
3266 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3267   switch (IntrID) {
3268   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3269   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3270     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3271   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3272   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3273     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3274   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3275   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3276     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3277   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3278   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3279     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3280   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3281   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3282     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3283   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3284   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3285     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3286   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3287   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3288     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3289   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3290   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3291     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3292   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3293   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3294     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3295   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3296   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3297     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3298   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3299   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3300     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3301   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3302   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3303     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3304   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3305   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3306     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3307   default:
3308     llvm_unreachable("unhandled atomic opcode");
3309   }
3310 }
3311 
3312 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3313                                                MachineIRBuilder &B,
3314                                                Intrinsic::ID IID) const {
3315   B.setInstr(MI);
3316 
3317   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3318                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3319 
3320   Register Dst = MI.getOperand(0).getReg();
3321   Register VData = MI.getOperand(2).getReg();
3322 
3323   Register CmpVal;
3324   int OpOffset = 0;
3325 
3326   if (IsCmpSwap) {
3327     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3328     ++OpOffset;
3329   }
3330 
3331   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3332   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
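  // Illustrative operand layout: a raw atomic is
  //   (dst, intrinsic id, vdata[, cmp], rsrc, voffset, soffset, aux)
  // and the struct variants insert a vindex operand after rsrc, giving
  // NumVIndexOps total operands.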
3333 
3334   // The struct intrinsic variants add one additional operand over raw.
3335   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3336   Register VIndex;
3337   if (HasVIndex) {
3338     VIndex = MI.getOperand(4 + OpOffset).getReg();
3339     ++OpOffset;
3340   }
3341 
3342   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3343   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3344   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3345 
3346   MachineMemOperand *MMO = *MI.memoperands_begin();
3347 
3348   unsigned ImmOffset;
3349   unsigned TotalOffset;
3350   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3351   if (TotalOffset != 0)
3352     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3353 
3354   if (!VIndex)
3355     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3356 
3357   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3358     .addDef(Dst)
3359     .addUse(VData); // vdata
3360 
3361   if (IsCmpSwap)
3362     MIB.addReg(CmpVal);
3363 
3364   MIB.addUse(RSrc)               // rsrc
3365      .addUse(VIndex)             // vindex
3366      .addUse(VOffset)            // voffset
3367      .addUse(SOffset)            // soffset
3368      .addImm(ImmOffset)          // offset(imm)
3369      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3370      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3371      .addMemOperand(MMO);
3372 
3373   MI.eraseFromParent();
3374   return true;
3375 }
3376 
/// Pack a set of s16 typed image address registers into dword-sized <2 x s16>
/// vectors, appending the results to \p PackedAddrs.
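/// Illustrative: a 2D coordinate pair (x, y) packs into a single <2 x s16>,
/// while in 1D the dx/dh and dx/dv derivatives are each padded with undef
/// into their own <2 x s16>.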
3379 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3380                                         SmallVectorImpl<Register> &PackedAddrs,
3381                                         int AddrIdx, int DimIdx, int NumVAddrs,
3382                                         int NumGradients) {
3383   const LLT S16 = LLT::scalar(16);
3384   const LLT V2S16 = LLT::vector(2, 16);
3385 
3386   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3387     Register AddrReg = MI.getOperand(I).getReg();
3388 
3389     if (I < DimIdx) {
3390       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3391       PackedAddrs.push_back(AddrReg);
3392     } else {
3393       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3394       // derivatives dx/dh and dx/dv are packed with undef.
3395       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3396           ((NumGradients / 2) % 2 == 1 &&
3397            (I == DimIdx + (NumGradients / 2) - 1 ||
3398             I == DimIdx + NumGradients - 1))) {
3399         PackedAddrs.push_back(
3400             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3401                 .getReg(0));
3402       } else {
3403         PackedAddrs.push_back(
3404             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3405                 .getReg(0));
3406         ++I;
3407       }
3408     }
3409   }
3410 }
3411 
3412 /// Convert from separate vaddr components to a single vector address register,
3413 /// and replace the remaining operands with $noreg.
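/// Illustrative: three s32 vaddr operands become one <3 x s32> build_vector
/// operand followed by two $noreg placeholders.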
3414 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3415                                      int DimIdx, int NumVAddrs) {
3416   SmallVector<Register, 8> AddrRegs(NumVAddrs);
3417   for (int I = 0; I != NumVAddrs; ++I) {
3418     AddrRegs[I] = MI.getOperand(DimIdx + I).getReg();
3419     assert(B.getMRI()->getType(AddrRegs[I]) == LLT::scalar(32));
3420   }
3421 
3422   auto VAddr = B.buildBuildVector(LLT::vector(NumVAddrs, 32), AddrRegs);
3423   MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3424   for (int I = 1; I != NumVAddrs; ++I)
3425     MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3426 }
3427 
/// Return the number of address arguments and the number of gradients.
3429 static std::pair<int, int>
3430 getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
3431                  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
3432   const AMDGPU::MIMGDimInfo *DimInfo
3433     = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
3434 
3435   int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
3436   int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
3437   int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
3438   int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
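  // Illustrative: image.sample.d.2d has 4 gradient and 2 coordinate vaddr
  // components (plus whatever NumExtraArgs the variant carries), with no
  // LOD/clamp/mip term.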
3439   return {NumVAddr, NumGradients};
3440 }
3441 
3442 static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
3443                        int NumDefs) {
3444   assert(!BaseOpcode->Atomic);
3445   return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
3446 }
3447 
/// Return the index of the first address operand in an image intrinsic.
3449 static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
3450                                  int NumDefs) {
3451   if (BaseOpcode->Atomic)
3452     return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
3453   return getDMaskIdx(BaseOpcode, NumDefs) + 1;
3454 }
3455 
3456 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3457 ///
/// Depending on the subtarget, loads and stores with 16-bit element data need to be
3459 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3460 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3461 /// registers.
3462 ///
/// We don't want to directly select image instructions just yet, but we also
/// want to expose all register repacking to the legalizer/combiners. We also
/// don't want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
3469 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3470     MachineInstr &MI, MachineIRBuilder &B,
3471     GISelChangeObserver &Observer,
3472     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3473   B.setInstr(MI);
3474 
3475   const int NumDefs = MI.getNumExplicitDefs();
3476   bool IsTFE = NumDefs == 2;
  // We only need to process the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or that need to repack the TFE result.
3479 
3480   // TODO: Do we need to guard against already legalized intrinsics?
3481   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3482     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3483 
3484   MachineRegisterInfo *MRI = B.getMRI();
3485   const LLT S32 = LLT::scalar(32);
3486   const LLT S16 = LLT::scalar(16);
3487   const LLT V2S16 = LLT::vector(2, 16);
3488 
3489   // Index of first address argument
3490   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3491 
  // Check for 16-bit addresses; if found, they are packed into dwords below.
3493   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3494   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3495   const bool IsA16 = AddrTy == S16;
3496 
3497   int NumVAddrs, NumGradients;
3498   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3499   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3500     getDMaskIdx(BaseOpcode, NumDefs);
3501   unsigned DMask = 0;
3502 
3503   int DMaskLanes = 0;
3504   if (!BaseOpcode->Atomic) {
3505     DMask = MI.getOperand(DMaskIdx).getImm();
3506     if (BaseOpcode->Gather4) {
3507       DMaskLanes = 4;
3508     } else if (DMask != 0) {
3509       DMaskLanes = countPopulation(DMask);
3510     } else if (!IsTFE && !BaseOpcode->Store) {
3511       // If dmask is 0, this is a no-op load. This can be eliminated.
3512       B.buildUndef(MI.getOperand(0));
3513       MI.eraseFromParent();
3514       return true;
3515     }
3516   }
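  // Illustrative: a dmask of 0b0101 enables two lanes, so only two data
  // elements are actually read or written for this operation.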
3517 
3518   Observer.changingInstr(MI);
3519   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3520 
3521   unsigned NewOpcode = NumDefs == 0 ?
3522     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3523 
3524   // Track that we legalized this
3525   MI.setDesc(B.getTII().get(NewOpcode));
3526 
  // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
3529   if (IsTFE && DMask == 0) {
3530     DMask = 0x1;
3531     DMaskLanes = 1;
3532     MI.getOperand(DMaskIdx).setImm(DMask);
3533   }
3534 
3535   // If the register allocator cannot place the address registers contiguously
3536   // without introducing moves, then using the non-sequential address encoding
3537   // is always preferable, since it saves VALU instructions and is usually a
3538   // wash in terms of code size or even better.
3539   //
3540   // However, we currently have no way of hinting to the register allocator
3541   // that MIMG addresses should be placed contiguously when it is possible to
3542   // do so, so force non-NSA for the common 2-address case as a heuristic.
3543   //
3544   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3545   // allocation when possible.
3546   const bool UseNSA = NumVAddrs >= 3 &&
3547                       ST.hasFeature(AMDGPU::FeatureNSAEncoding);
3548 
3549   // Rewrite the addressing register layout before doing anything else.
3550   if (IsA16) {
3551 #if 0
3552     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3553     // should be introduced.
3554     if (!ST.hasFeature(AMDGPU::FeatureR128A16))
3555       return false;
3556 #endif
3557 
3558     if (NumVAddrs > 1) {
3559       SmallVector<Register, 4> PackedRegs;
3560       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3561                                   NumGradients);
3562 
3563       if (!UseNSA && PackedRegs.size() > 1) {
3564         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3565         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3566         PackedRegs[0] = Concat.getReg(0);
3567         PackedRegs.resize(1);
3568       }
3569 
3570       const int NumPacked = PackedRegs.size();
3571       for (int I = 0; I != NumVAddrs; ++I) {
3572         assert(MI.getOperand(AddrIdx + I).getReg() != AMDGPU::NoRegister);
3573 
3574         if (I < NumPacked)
3575           MI.getOperand(AddrIdx + I).setReg(PackedRegs[I]);
3576         else
3577           MI.getOperand(AddrIdx + I).setReg(AMDGPU::NoRegister);
3578       }
3579     }
3580   } else if (!UseNSA && NumVAddrs > 1) {
3581     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3582   }
3583 
3584   if (BaseOpcode->Store) { // No TFE for stores?
3585     // TODO: Handle dmask trim
3586     Register VData = MI.getOperand(1).getReg();
3587     LLT Ty = MRI->getType(VData);
3588     if (!Ty.isVector() || Ty.getElementType() != S16)
3589       return true;
3590 
3591     B.setInstr(MI);
3592 
3593     Register RepackedReg = handleD16VData(B, *MRI, VData);
3594     if (RepackedReg != VData) {
3595       MI.getOperand(1).setReg(RepackedReg);
3596     }
3597 
3598     return true;
3599   }
3600 
3601   Register DstReg = MI.getOperand(0).getReg();
3602   LLT Ty = MRI->getType(DstReg);
3603   const LLT EltTy = Ty.getScalarType();
3604   const bool IsD16 = Ty.getScalarType() == S16;
3605   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3606 
3607   // Confirm that the return type is large enough for the dmask specified
3608   if (NumElts < DMaskLanes)
3609     return false;
3610 
3611   if (NumElts > 4 || DMaskLanes > 4)
3612     return false;
3613 
3614   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3615   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3616 
  // The raw, dword-aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3620   LLT RoundedTy;
3621 
  // S32 vector to cover all data, plus the TFE result element.
3623   LLT TFETy;
3624 
3625   // Register type to use for each loaded component. Will be S32 or V2S16.
3626   LLT RegTy;
3627 
3628   if (IsD16 && ST.hasUnpackedD16VMem()) {
3629     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3630     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3631     RegTy = S32;
3632   } else {
3633     unsigned EltSize = EltTy.getSizeInBits();
3634     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3635     unsigned RoundedSize = 32 * RoundedElts;
3636     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3637     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3638     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3639   }
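  // Illustrative (assuming TFE is not set): a packed-d16 load with 3 dmask
  // lanes gives AdjustedTy = <3 x s16>, RoundedTy = <4 x s16>,
  // TFETy = <3 x s32> and RegTy = <2 x s16>.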
3640 
3641   // The return type does not need adjustment.
3642   // TODO: Should we change s16 case to s32 or <2 x s16>?
3643   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3644     return true;
3645 
3646   Register Dst1Reg;
3647 
3648   // Insert after the instruction.
3649   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3650 
3651   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3652   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3653   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3654   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3655 
3656   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3657 
3658   MI.getOperand(0).setReg(NewResultReg);
3659 
  // In the IR, TFE is supposed to be used with a two-element struct return
  // type. The instruction really returns these two values in one contiguous
3662   // register, with one additional dword beyond the loaded data. Rewrite the
3663   // return type to use a single register result.
3664 
3665   if (IsTFE) {
3666     Dst1Reg = MI.getOperand(1).getReg();
3667     if (MRI->getType(Dst1Reg) != S32)
3668       return false;
3669 
3670     // TODO: Make sure the TFE operand bit is set.
3671     MI.RemoveOperand(1);
3672 
3673     // Handle the easy case that requires no repack instructions.
3674     if (Ty == S32) {
3675       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3676       return true;
3677     }
3678   }
3679 
3680   // Now figure out how to copy the new result register back into the old
3681   // result.
3682   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3683 
  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
3685 
3686   if (ResultNumRegs == 1) {
3687     assert(!IsTFE);
3688     ResultRegs[0] = NewResultReg;
3689   } else {
3690     // We have to repack into a new vector of some kind.
3691     for (int I = 0; I != NumDataRegs; ++I)
3692       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3693     B.buildUnmerge(ResultRegs, NewResultReg);
3694 
3695     // Drop the final TFE element to get the data part. The TFE result is
3696     // directly written to the right place already.
3697     if (IsTFE)
3698       ResultRegs.resize(NumDataRegs);
3699   }
3700 
3701   // For an s16 scalar result, we form an s32 result with a truncate regardless
3702   // of packed vs. unpacked.
3703   if (IsD16 && !Ty.isVector()) {
3704     B.buildTrunc(DstReg, ResultRegs[0]);
3705     return true;
3706   }
3707 
3708   // Avoid a build/concat_vector of 1 entry.
3709   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3710     B.buildBitcast(DstReg, ResultRegs[0]);
3711     return true;
3712   }
3713 
3714   assert(Ty.isVector());
3715 
3716   if (IsD16) {
3717     // For packed D16 results with TFE enabled, all the data components are
3718     // S32. Cast back to the expected type.
3719     //
    // TODO: We don't really need to load s32 elements. We would only need one
3721     // cast for the TFE result if a multiple of v2s16 was used.
3722     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3723       for (Register &Reg : ResultRegs)
3724         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3725     } else if (ST.hasUnpackedD16VMem()) {
3726       for (Register &Reg : ResultRegs)
3727         Reg = B.buildTrunc(S16, Reg).getReg(0);
3728     }
3729   }
3730 
3731   auto padWithUndef = [&](LLT Ty, int NumElts) {
3732     if (NumElts == 0)
3733       return;
3734     Register Undef = B.buildUndef(Ty).getReg(0);
3735     for (int I = 0; I != NumElts; ++I)
3736       ResultRegs.push_back(Undef);
3737   };
3738 
3739   // Pad out any elements eliminated due to the dmask.
3740   LLT ResTy = MRI->getType(ResultRegs[0]);
3741   if (!ResTy.isVector()) {
3742     padWithUndef(ResTy, NumElts - ResultRegs.size());
3743     B.buildBuildVector(DstReg, ResultRegs);
3744     return true;
3745   }
3746 
3747   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3748   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3749 
3750   // Deal with the one annoying legal case.
3751   const LLT V3S16 = LLT::vector(3, 16);
3752   if (Ty == V3S16) {
3753     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
3754     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
3755     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
3756     return true;
3757   }
3758 
3759   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
3760   B.buildConcatVectors(DstReg, ResultRegs);
3761   return true;
3762 }
3763 
3764 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3765   MachineInstr &MI, MachineIRBuilder &B,
3766   GISelChangeObserver &Observer) const {
3767   Register Dst = MI.getOperand(0).getReg();
3768   LLT Ty = B.getMRI()->getType(Dst);
3769   unsigned Size = Ty.getSizeInBits();
3770   MachineFunction &MF = B.getMF();
3771 
3772   Observer.changingInstr(MI);
3773 
3774   // FIXME: We don't really need this intermediate instruction. The intrinsic
3775   // should be fixed to have a memory operand. Since it's readnone, we're not
3776   // allowed to add one.
3777   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3778   MI.RemoveOperand(1); // Remove intrinsic ID
3779 
3780   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3781   // TODO: Should this use datalayout alignment?
3782   const unsigned MemSize = (Size + 7) / 8;
3783   const unsigned MemAlign = 4;
3784   MachineMemOperand *MMO = MF.getMachineMemOperand(
3785     MachinePointerInfo(),
3786     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3787     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3788   MI.addMemOperand(MF, MMO);
3789 
3790   // There are no 96-bit result scalar loads, but widening to 128-bit should
3791   // always be legal. We may need to restore this to a 96-bit result if it turns
3792   // out this needs to be converted to a vector load during RegBankSelect.
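  // Illustrative: an s96 result is widened to s128 and a <3 x s32> result is
  // padded to <4 x s32>, so the result of G_AMDGPU_S_BUFFER_LOAD always has a
  // power-of-two size.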
3793   if (!isPowerOf2_32(Size)) {
3794     LegalizerHelper Helper(MF, *this, Observer, B);
3795     B.setInstr(MI);
3796 
3797     if (Ty.isVector())
3798       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3799     else
3800       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3801   }
3802 
3803   Observer.changedInstr(MI);
3804   return true;
3805 }
3806 
3807 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
3808                                                 MachineRegisterInfo &MRI,
3809                                                 MachineIRBuilder &B) const {
3810   B.setInstr(MI);
3811 
  // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
3813   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3814       !ST.isTrapHandlerEnabled()) {
3815     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
3816   } else {
    // Pass the queue pointer to the trap handler as input, and insert the trap.
3818     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
3819     const ArgDescriptor *Arg =
3820         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
3821     if (!Arg)
3822       return false;
3823     MachineRegisterInfo &MRI = *B.getMRI();
3824     Register SGPR01(AMDGPU::SGPR0_SGPR1);
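    // The HSA trap handler ABI documented above expects the queue pointer in
    // SGPR0/SGPR1, so the preloaded value is copied into that physical pair.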
3825     Register LiveIn = getLiveInRegister(
3826         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
3827         /*InsertLiveInCopy=*/false);
3828     if (!loadInputValue(LiveIn, B, Arg))
3829       return false;
3830     B.buildCopy(SGPR01, LiveIn);
3831     B.buildInstr(AMDGPU::S_TRAP)
3832         .addImm(GCNSubtarget::TrapIDLLVMTrap)
3833         .addReg(SGPR01, RegState::Implicit);
3834   }
3835 
3836   MI.eraseFromParent();
3837   return true;
3838 }
3839 
3840 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
3841     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3842   B.setInstr(MI);
3843 
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
3846   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3847       !ST.isTrapHandlerEnabled()) {
3848     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
3849                                      "debugtrap handler not supported",
3850                                      MI.getDebugLoc(), DS_Warning);
3851     LLVMContext &Ctx = B.getMF().getFunction().getContext();
3852     Ctx.diagnose(NoTrap);
3853   } else {
3854     // Insert debug-trap instruction
3855     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
3856   }
3857 
3858   MI.eraseFromParent();
3859   return true;
3860 }
3861 
3862 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3863                                             MachineIRBuilder &B,
3864                                             GISelChangeObserver &Observer) const {
3865   MachineRegisterInfo &MRI = *B.getMRI();
3866 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
3868   auto IntrID = MI.getIntrinsicID();
3869   switch (IntrID) {
3870   case Intrinsic::amdgcn_if:
3871   case Intrinsic::amdgcn_else: {
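    // Hedged summary: the G_BRCOND that consumes this intrinsic's boolean
    // result is rewritten together with the intrinsic into a single SI_IF or
    // SI_ELSE pseudo, which both manipulates exec and branches to the original
    // target block.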
3872     MachineInstr *Br = nullptr;
3873     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3874       const SIRegisterInfo *TRI
3875         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3876 
3877       B.setInstr(*BrCond);
3878       Register Def = MI.getOperand(1).getReg();
3879       Register Use = MI.getOperand(3).getReg();
3880 
3881       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3882       if (Br)
3883         BrTarget = Br->getOperand(0).getMBB();
3884 
3885       if (IntrID == Intrinsic::amdgcn_if) {
3886         B.buildInstr(AMDGPU::SI_IF)
3887           .addDef(Def)
3888           .addUse(Use)
3889           .addMBB(BrTarget);
3890       } else {
3891         B.buildInstr(AMDGPU::SI_ELSE)
3892           .addDef(Def)
3893           .addUse(Use)
3894           .addMBB(BrTarget)
3895           .addImm(0);
3896       }
3897 
3898       if (Br)
3899         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3900 
3901       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3902       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3903       MI.eraseFromParent();
3904       BrCond->eraseFromParent();
3905       return true;
3906     }
3907 
3908     return false;
3909   }
3910   case Intrinsic::amdgcn_loop: {
3911     MachineInstr *Br = nullptr;
3912     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3913       const SIRegisterInfo *TRI
3914         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3915 
3916       B.setInstr(*BrCond);
3917 
3918       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3919       if (Br)
3920         BrTarget = Br->getOperand(0).getMBB();
3921 
3922       Register Reg = MI.getOperand(2).getReg();
3923       B.buildInstr(AMDGPU::SI_LOOP)
3924         .addUse(Reg)
3925         .addMBB(BrTarget);
3926 
3927       if (Br)
3928         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3929 
3930       MI.eraseFromParent();
3931       BrCond->eraseFromParent();
3932       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3933       return true;
3934     }
3935 
3936     return false;
3937   }
3938   case Intrinsic::amdgcn_kernarg_segment_ptr:
3939     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
3940       B.setInstr(MI);
3941       // This only makes sense to call in a kernel, so just lower to null.
3942       B.buildConstant(MI.getOperand(0).getReg(), 0);
3943       MI.eraseFromParent();
3944       return true;
3945     }
3946 
3947     return legalizePreloadedArgIntrin(
3948       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3949   case Intrinsic::amdgcn_implicitarg_ptr:
3950     return legalizeImplicitArgPtr(MI, MRI, B);
3951   case Intrinsic::amdgcn_workitem_id_x:
3952     return legalizePreloadedArgIntrin(MI, MRI, B,
3953                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3954   case Intrinsic::amdgcn_workitem_id_y:
3955     return legalizePreloadedArgIntrin(MI, MRI, B,
3956                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3957   case Intrinsic::amdgcn_workitem_id_z:
3958     return legalizePreloadedArgIntrin(MI, MRI, B,
3959                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3960   case Intrinsic::amdgcn_workgroup_id_x:
3961     return legalizePreloadedArgIntrin(MI, MRI, B,
3962                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3963   case Intrinsic::amdgcn_workgroup_id_y:
3964     return legalizePreloadedArgIntrin(MI, MRI, B,
3965                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3966   case Intrinsic::amdgcn_workgroup_id_z:
3967     return legalizePreloadedArgIntrin(MI, MRI, B,
3968                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3969   case Intrinsic::amdgcn_dispatch_ptr:
3970     return legalizePreloadedArgIntrin(MI, MRI, B,
3971                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3972   case Intrinsic::amdgcn_queue_ptr:
3973     return legalizePreloadedArgIntrin(MI, MRI, B,
3974                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3975   case Intrinsic::amdgcn_implicit_buffer_ptr:
3976     return legalizePreloadedArgIntrin(
3977       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3978   case Intrinsic::amdgcn_dispatch_id:
3979     return legalizePreloadedArgIntrin(MI, MRI, B,
3980                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3981   case Intrinsic::amdgcn_fdiv_fast:
3982     return legalizeFDIVFastIntrin(MI, MRI, B);
3983   case Intrinsic::amdgcn_is_shared:
3984     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3985   case Intrinsic::amdgcn_is_private:
3986     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3987   case Intrinsic::amdgcn_wavefrontsize: {
3988     B.setInstr(MI);
3989     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3990     MI.eraseFromParent();
3991     return true;
3992   }
3993   case Intrinsic::amdgcn_s_buffer_load:
3994     return legalizeSBufferLoad(MI, B, Observer);
3995   case Intrinsic::amdgcn_raw_buffer_store:
3996   case Intrinsic::amdgcn_struct_buffer_store:
3997     return legalizeBufferStore(MI, MRI, B, false, false);
3998   case Intrinsic::amdgcn_raw_buffer_store_format:
3999   case Intrinsic::amdgcn_struct_buffer_store_format:
4000     return legalizeBufferStore(MI, MRI, B, false, true);
4001   case Intrinsic::amdgcn_raw_tbuffer_store:
4002   case Intrinsic::amdgcn_struct_tbuffer_store:
4003     return legalizeBufferStore(MI, MRI, B, true, true);
4004   case Intrinsic::amdgcn_raw_buffer_load:
4005   case Intrinsic::amdgcn_struct_buffer_load:
4006     return legalizeBufferLoad(MI, MRI, B, false, false);
4007   case Intrinsic::amdgcn_raw_buffer_load_format:
4008   case Intrinsic::amdgcn_struct_buffer_load_format:
4009     return legalizeBufferLoad(MI, MRI, B, true, false);
4010   case Intrinsic::amdgcn_raw_tbuffer_load:
4011   case Intrinsic::amdgcn_struct_tbuffer_load:
4012     return legalizeBufferLoad(MI, MRI, B, true, true);
4013   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4014   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4015   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4016   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4017   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4018   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4019   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4020   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4021   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4022   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4023   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4024   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4025   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4026   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4027   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4028   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4029   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4030   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4031   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4032   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4033   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4034   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4035   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4036   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4037   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4038   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4039     return legalizeBufferAtomic(MI, B, IntrID);
4040   case Intrinsic::amdgcn_atomic_inc:
4041     return legalizeAtomicIncDec(MI, B, true);
4042   case Intrinsic::amdgcn_atomic_dec:
4043     return legalizeAtomicIncDec(MI, B, false);
4044   case Intrinsic::trap:
4045     return legalizeTrapIntrinsic(MI, MRI, B);
4046   case Intrinsic::debugtrap:
4047     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4048   default: {
4049     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4050             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4051       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4052     return true;
4053   }
4054   }
4055 
4056   return true;
4057 }
4058