1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/ADT/ScopeExit.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/DerivedTypes.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Type.h"
36 #include "llvm/Support/Debug.h"
37 
38 #define DEBUG_TYPE "amdgpu-legalinfo"
39 
40 using namespace llvm;
41 using namespace LegalizeActions;
42 using namespace LegalizeMutations;
43 using namespace LegalityPredicates;
44 using namespace MIPatternMatch;
45 
46 // Round the number of elements to the next power of two.
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the number of bits to the next power of two.
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
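// True for types whose scalar element size is a multiple of 32 bits and whose
// total size does not exceed MaxSize bits.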
60 static LegalityPredicate isMultiple32(unsigned TypeIdx,
61                                       unsigned MaxSize = 1024) {
62   return [=](const LegalityQuery &Query) {
63     const LLT Ty = Query.Types[TypeIdx];
64     const LLT EltTy = Ty.getScalarType();
65     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
66   };
67 }
68 
69 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
70   return [=](const LegalityQuery &Query) {
71     return Query.Types[TypeIdx].getSizeInBits() == Size;
72   };
73 }
74 
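// Vectors with an odd number of sub-32-bit elements whose total size is not a
// multiple of 32 bits, e.g. v3s16.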
75 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     return Ty.isVector() &&
79            Ty.getNumElements() % 2 != 0 &&
80            Ty.getElementType().getSizeInBits() < 32 &&
81            Ty.getSizeInBits() % 32 != 0;
82   };
83 }
84 
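// Vectors of 16-bit elements with more than two elements.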
85 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
86   return [=](const LegalityQuery &Query) {
87     const LLT Ty = Query.Types[TypeIdx];
88     const LLT EltTy = Ty.getScalarType();
89     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
90   };
91 }
92 
93 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getElementType();
97     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
98   };
99 }
100 
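// Split a wide vector into pieces of at most 64 bits each, keeping the element
// type; e.g. v4s32 is broken into v2s32 pieces.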
101 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     unsigned Size = Ty.getSizeInBits();
106     unsigned Pieces = (Size + 63) / 64;
107     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
108     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
109   };
110 }
111 
112 // Increase the number of vector elements so the total size reaches the next
113 // multiple of 32 bits.
114 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
115   return [=](const LegalityQuery &Query) {
116     const LLT Ty = Query.Types[TypeIdx];
117 
118     const LLT EltTy = Ty.getElementType();
119     const int Size = Ty.getSizeInBits();
120     const int EltSize = EltTy.getSizeInBits();
121     const int NextMul32 = (Size + 31) / 32;
122 
123     assert(EltSize < 32);
124 
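    // Round the element count up so the new vector covers NextMul32 32-bit
    // words; e.g. v3s8 (24 bits) is widened to v4s8 (32 bits).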
125     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
126     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
127   };
128 }
129 
130 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
131   return [=](const LegalityQuery &Query) {
132     const LLT QueryTy = Query.Types[TypeIdx];
133     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
134   };
135 }
136 
137 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
138   return [=](const LegalityQuery &Query) {
139     const LLT QueryTy = Query.Types[TypeIdx];
140     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
141   };
142 }
143 
144 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
145   return [=](const LegalityQuery &Query) {
146     const LLT QueryTy = Query.Types[TypeIdx];
147     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
148   };
149 }
150 
151 // Vectors of 32/64/128/256-bit elements or multiples of v2s16, plus scalars
152 // that are a multiple of 32 bits up to 1024 bits.
153 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
154   return [=](const LegalityQuery &Query) {
155     const LLT Ty = Query.Types[TypeIdx];
156     if (Ty.isVector()) {
157       const int EltSize = Ty.getElementType().getSizeInBits();
158       return EltSize == 32 || EltSize == 64 ||
159             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
160              EltSize == 128 || EltSize == 256;
161     }
162 
163     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
164   };
165 }
166 
167 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
168   return [=](const LegalityQuery &Query) {
169     const LLT QueryTy = Query.Types[TypeIdx];
170     return QueryTy.isVector() && QueryTy.getElementType() == Type;
171   };
172 }
173 
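// Vectors whose element type is either s16 or at least 32 bits wide.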
174 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
175   return [=](const LegalityQuery &Query) {
176     const LLT QueryTy = Query.Types[TypeIdx];
177     if (!QueryTy.isVector())
178       return false;
179     const LLT EltTy = QueryTy.getElementType();
180     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
181   };
182 }
183 
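// Scalars wider than 32 bits that store fewer bits to memory than the size of
// the value, i.e. wide truncating stores.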
184 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     const LLT Ty = Query.Types[TypeIdx];
187     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
188            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
189   };
190 }
191 
192 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
193   return [=](const LegalityQuery &Query) {
194     return Query.Types[TypeIdx0].getSizeInBits() <
195            Query.Types[TypeIdx1].getSizeInBits();
196   };
197 }
198 
199 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
200   return [=](const LegalityQuery &Query) {
201     return Query.Types[TypeIdx0].getSizeInBits() >
202            Query.Types[TypeIdx1].getSizeInBits();
203   };
204 }
205 
206 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
207                                          const GCNTargetMachine &TM)
208   :  ST(ST_) {
209   using namespace TargetOpcode;
210 
211   auto GetAddrSpacePtr = [&TM](unsigned AS) {
212     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
213   };
214 
215   const LLT S1 = LLT::scalar(1);
216   const LLT S16 = LLT::scalar(16);
217   const LLT S32 = LLT::scalar(32);
218   const LLT S64 = LLT::scalar(64);
219   const LLT S128 = LLT::scalar(128);
220   const LLT S256 = LLT::scalar(256);
221   const LLT S512 = LLT::scalar(512);
222   const LLT S1024 = LLT::scalar(1024);
223 
224   const LLT V2S16 = LLT::vector(2, 16);
225   const LLT V4S16 = LLT::vector(4, 16);
226 
227   const LLT V2S32 = LLT::vector(2, 32);
228   const LLT V3S32 = LLT::vector(3, 32);
229   const LLT V4S32 = LLT::vector(4, 32);
230   const LLT V5S32 = LLT::vector(5, 32);
231   const LLT V6S32 = LLT::vector(6, 32);
232   const LLT V7S32 = LLT::vector(7, 32);
233   const LLT V8S32 = LLT::vector(8, 32);
234   const LLT V9S32 = LLT::vector(9, 32);
235   const LLT V10S32 = LLT::vector(10, 32);
236   const LLT V11S32 = LLT::vector(11, 32);
237   const LLT V12S32 = LLT::vector(12, 32);
238   const LLT V13S32 = LLT::vector(13, 32);
239   const LLT V14S32 = LLT::vector(14, 32);
240   const LLT V15S32 = LLT::vector(15, 32);
241   const LLT V16S32 = LLT::vector(16, 32);
242   const LLT V32S32 = LLT::vector(32, 32);
243 
244   const LLT V2S64 = LLT::vector(2, 64);
245   const LLT V3S64 = LLT::vector(3, 64);
246   const LLT V4S64 = LLT::vector(4, 64);
247   const LLT V5S64 = LLT::vector(5, 64);
248   const LLT V6S64 = LLT::vector(6, 64);
249   const LLT V7S64 = LLT::vector(7, 64);
250   const LLT V8S64 = LLT::vector(8, 64);
251   const LLT V16S64 = LLT::vector(16, 64);
252 
253   std::initializer_list<LLT> AllS32Vectors =
254     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
255      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
256   std::initializer_list<LLT> AllS64Vectors =
257     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
258 
259   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
260   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
261   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
262   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
263   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
264   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
265   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
266 
267   const LLT CodePtr = FlatPtr;
268 
269   const std::initializer_list<LLT> AddrSpaces64 = {
270     GlobalPtr, ConstantPtr, FlatPtr
271   };
272 
273   const std::initializer_list<LLT> AddrSpaces32 = {
274     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
275   };
276 
277   const std::initializer_list<LLT> FPTypesBase = {
278     S32, S64
279   };
280 
281   const std::initializer_list<LLT> FPTypes16 = {
282     S32, S64, S16
283   };
284 
285   const std::initializer_list<LLT> FPTypesPK16 = {
286     S32, S64, S16, V2S16
287   };
288 
289   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
290 
291   setAction({G_BRCOND, S1}, Legal); // VCC branches
292   setAction({G_BRCOND, S32}, Legal); // SCC branches
293 
294   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
295   // elements for v3s16
296   getActionDefinitionsBuilder(G_PHI)
297     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
298     .legalFor(AllS32Vectors)
299     .legalFor(AllS64Vectors)
300     .legalFor(AddrSpaces64)
301     .legalFor(AddrSpaces32)
302     .clampScalar(0, S32, S256)
303     .widenScalarToNextPow2(0, 32)
304     .clampMaxNumElements(0, S32, 16)
305     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
306     .legalIf(isPointer(0));
307 
308   if (ST.hasVOP3PInsts()) {
309     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
310       .legalFor({S32, S16, V2S16})
311       .clampScalar(0, S16, S32)
312       .clampMaxNumElements(0, S16, 2)
313       .scalarize(0)
314       .widenScalarToNextPow2(0, 32);
315   } else if (ST.has16BitInsts()) {
316     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
317       .legalFor({S32, S16})
318       .clampScalar(0, S16, S32)
319       .scalarize(0)
320       .widenScalarToNextPow2(0, 32);
321   } else {
322     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
323       .legalFor({S32})
324       .clampScalar(0, S32, S32)
325       .scalarize(0);
326   }
327 
328   // FIXME: Not really legal. Placeholder for custom lowering.
329   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
330     .customFor({S32, S64})
331     .clampScalar(0, S32, S64)
332     .widenScalarToNextPow2(0, 32)
333     .scalarize(0);
334 
335   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
336     .legalFor({S32})
337     .clampScalar(0, S32, S32)
338     .scalarize(0);
339 
340   // Report legal for any types we can handle anywhere. For the cases only legal
341   // on the SALU, RegBankSelect will be able to re-legalize.
342   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
343     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
344     .clampScalar(0, S32, S64)
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
347     .widenScalarToNextPow2(0)
348     .scalarize(0);
349 
350   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
351                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
352     .legalFor({{S32, S1}, {S32, S32}})
353     .minScalar(0, S32)
354     // TODO: .scalarize(0)
355     .lower();
356 
357   getActionDefinitionsBuilder(G_BITCAST)
358     // Don't worry about the size constraint.
359     .legalIf(all(isRegisterType(0), isRegisterType(1)))
360     .lower();
361 
362 
363   getActionDefinitionsBuilder(G_CONSTANT)
364     .legalFor({S1, S32, S64, S16, GlobalPtr,
365                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
366     .clampScalar(0, S32, S64)
367     .widenScalarToNextPow2(0)
368     .legalIf(isPointer(0));
369 
370   getActionDefinitionsBuilder(G_FCONSTANT)
371     .legalFor({S32, S64, S16})
372     .clampScalar(0, S16, S64);
373 
374   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
375     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
376                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
377     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
378     .clampScalarOrElt(0, S32, S1024)
379     .legalIf(isMultiple32(0))
380     .widenScalarToNextPow2(0, 32)
381     .clampMaxNumElements(0, S32, 16);
382 
383   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
384   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
385     .unsupportedFor({PrivatePtr})
386     .custom();
387   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
388 
389   auto &FPOpActions = getActionDefinitionsBuilder(
390     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
391     .legalFor({S32, S64});
392   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
393     .customFor({S32, S64});
394   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
395     .customFor({S32, S64});
396 
397   if (ST.has16BitInsts()) {
398     if (ST.hasVOP3PInsts())
399       FPOpActions.legalFor({S16, V2S16});
400     else
401       FPOpActions.legalFor({S16});
402 
403     TrigActions.customFor({S16});
404     FDIVActions.customFor({S16});
405   }
406 
407   auto &MinNumMaxNum = getActionDefinitionsBuilder({
408       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
409 
410   if (ST.hasVOP3PInsts()) {
411     MinNumMaxNum.customFor(FPTypesPK16)
412       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
413       .clampMaxNumElements(0, S16, 2)
414       .clampScalar(0, S16, S64)
415       .scalarize(0);
416   } else if (ST.has16BitInsts()) {
417     MinNumMaxNum.customFor(FPTypes16)
418       .clampScalar(0, S16, S64)
419       .scalarize(0);
420   } else {
421     MinNumMaxNum.customFor(FPTypesBase)
422       .clampScalar(0, S32, S64)
423       .scalarize(0);
424   }
425 
426   if (ST.hasVOP3PInsts())
427     FPOpActions.clampMaxNumElements(0, S16, 2);
428 
429   FPOpActions
430     .scalarize(0)
431     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
432 
433   TrigActions
434     .scalarize(0)
435     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
436 
437   FDIVActions
438     .scalarize(0)
439     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
440 
441   getActionDefinitionsBuilder({G_FNEG, G_FABS})
442     .legalFor(FPTypesPK16)
443     .clampMaxNumElements(0, S16, 2)
444     .scalarize(0)
445     .clampScalar(0, S16, S64);
446 
447   if (ST.has16BitInsts()) {
448     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
449       .legalFor({S32, S64, S16})
450       .scalarize(0)
451       .clampScalar(0, S16, S64);
452   } else {
453     getActionDefinitionsBuilder(G_FSQRT)
454       .legalFor({S32, S64})
455       .scalarize(0)
456       .clampScalar(0, S32, S64);
457 
458     if (ST.hasFractBug()) {
459       getActionDefinitionsBuilder(G_FFLOOR)
460         .customFor({S64})
461         .legalFor({S32, S64})
462         .scalarize(0)
463         .clampScalar(0, S32, S64);
464     } else {
465       getActionDefinitionsBuilder(G_FFLOOR)
466         .legalFor({S32, S64})
467         .scalarize(0)
468         .clampScalar(0, S32, S64);
469     }
470   }
471 
472   getActionDefinitionsBuilder(G_FPTRUNC)
473     .legalFor({{S32, S64}, {S16, S32}})
474     .scalarize(0)
475     .lower();
476 
477   getActionDefinitionsBuilder(G_FPEXT)
478     .legalFor({{S64, S32}, {S32, S16}})
479     .lowerFor({{S64, S16}}) // FIXME: Implement
480     .scalarize(0);
481 
482   getActionDefinitionsBuilder(G_FSUB)
483       // Use actual fsub instruction
484       .legalFor({S32})
485       // Must use fadd + fneg
486       .lowerFor({S64, S16, V2S16})
487       .scalarize(0)
488       .clampScalar(0, S32, S64);
489 
490   // Whether this is legal depends on the floating point mode for the function.
491   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
492   if (ST.hasMadF16())
493     FMad.customFor({S32, S16});
494   else
495     FMad.customFor({S32});
496   FMad.scalarize(0)
497       .lower();
498 
499   // TODO: Do we need to clamp maximum bitwidth?
500   getActionDefinitionsBuilder(G_TRUNC)
501     .legalIf(isScalar(0))
502     .legalFor({{V2S16, V2S32}})
503     .clampMaxNumElements(0, S16, 2)
504     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
505     // situations (like an invalid implicit use), we don't want to loop forever
506     // in the legalizer.
507     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
508     .alwaysLegal();
509 
510   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
511     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
512                {S32, S1}, {S64, S1}, {S16, S1}})
513     .scalarize(0)
514     .clampScalar(0, S32, S64)
515     .widenScalarToNextPow2(1, 32);
516 
517   // TODO: Split s1->s64 during regbankselect for VALU.
518   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
519     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
520     .lowerFor({{S32, S64}})
521     .lowerIf(typeIs(1, S1))
522     .customFor({{S64, S64}});
523   if (ST.has16BitInsts())
524     IToFP.legalFor({{S16, S16}});
525   IToFP.clampScalar(1, S32, S64)
526        .scalarize(0)
527        .widenScalarToNextPow2(1);
528 
529   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
530     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
531     .customFor({{S64, S64}});
532   if (ST.has16BitInsts())
533     FPToI.legalFor({{S16, S16}});
534   else
535     FPToI.minScalar(1, S32);
536 
537   FPToI.minScalar(0, S32)
538        .scalarize(0)
539        .lower();
540 
541   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
542     .scalarize(0)
543     .lower();
544 
545   if (ST.has16BitInsts()) {
546     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
547       .legalFor({S16, S32, S64})
548       .clampScalar(0, S16, S64)
549       .scalarize(0);
550   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
551     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
552       .legalFor({S32, S64})
553       .clampScalar(0, S32, S64)
554       .scalarize(0);
555   } else {
556     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
557       .legalFor({S32})
558       .customFor({S64})
559       .clampScalar(0, S32, S64)
560       .scalarize(0);
561   }
562 
563   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
564     .scalarize(0)
565     .alwaysLegal();
566 
567   auto &CmpBuilder =
568     getActionDefinitionsBuilder(G_ICMP)
569     // The compare output type differs based on the register bank of the output,
570     // so make both s1 and s32 legal.
571     //
572     // Scalar compares producing output in scc will be promoted to s32, as that
573     // is the allocatable register type that will be needed for the copy from
574     // scc. This will be promoted during RegBankSelect, and we assume something
575     // before that won't try to use s32 result types.
576     //
577     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
578     // bank.
579     .legalForCartesianProduct(
580       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
581     .legalForCartesianProduct(
582       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
583   if (ST.has16BitInsts()) {
584     CmpBuilder.legalFor({{S1, S16}});
585   }
586 
587   CmpBuilder
588     .widenScalarToNextPow2(1)
589     .clampScalar(1, S32, S64)
590     .scalarize(0)
591     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
592 
593   getActionDefinitionsBuilder(G_FCMP)
594     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
595     .widenScalarToNextPow2(1)
596     .clampScalar(1, S32, S64)
597     .scalarize(0);
598 
599   // FIXME: fpow has a selection pattern that should move to custom lowering.
600   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
601   if (ST.has16BitInsts())
602     Exp2Ops.legalFor({S32, S16});
603   else
604     Exp2Ops.legalFor({S32});
605   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
606   Exp2Ops.scalarize(0);
607 
608   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
609   if (ST.has16BitInsts())
610     ExpOps.customFor({{S32}, {S16}});
611   else
612     ExpOps.customFor({S32});
613   ExpOps.clampScalar(0, MinScalarFPTy, S32)
614         .scalarize(0);
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder(G_CTPOP)
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   // The hardware instructions return a different result on 0 than the generic
626   // instructions expect. The hardware produces -1, but these produce the
627   // bitwidth.
628   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
629     .scalarize(0)
630     .clampScalar(0, S32, S32)
631     .clampScalar(1, S32, S64)
632     .widenScalarToNextPow2(0, 32)
633     .widenScalarToNextPow2(1, 32)
634     .lower();
635 
636   // The 64-bit versions produce 32-bit results, but only on the SALU.
637   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
638     .legalFor({{S32, S32}, {S32, S64}})
639     .clampScalar(0, S32, S32)
640     .clampScalar(1, S32, S64)
641     .scalarize(0)
642     .widenScalarToNextPow2(0, 32)
643     .widenScalarToNextPow2(1, 32);
644 
645   getActionDefinitionsBuilder(G_BITREVERSE)
646     .legalFor({S32})
647     .clampScalar(0, S32, S32)
648     .scalarize(0);
649 
650   if (ST.has16BitInsts()) {
651     getActionDefinitionsBuilder(G_BSWAP)
652       .legalFor({S16, S32, V2S16})
653       .clampMaxNumElements(0, S16, 2)
654       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
655       // narrowScalar limitation.
656       .widenScalarToNextPow2(0)
657       .clampScalar(0, S16, S32)
658       .scalarize(0);
659 
660     if (ST.hasVOP3PInsts()) {
661       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
662         .legalFor({S32, S16, V2S16})
663         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
664         .clampMaxNumElements(0, S16, 2)
665         .minScalar(0, S16)
666         .widenScalarToNextPow2(0)
667         .scalarize(0)
668         .lower();
669     } else {
670       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
671         .legalFor({S32, S16})
672         .widenScalarToNextPow2(0)
673         .minScalar(0, S16)
674         .scalarize(0)
675         .lower();
676     }
677   } else {
678     // TODO: Should have same legality without v_perm_b32
679     getActionDefinitionsBuilder(G_BSWAP)
680       .legalFor({S32})
681       .lowerIf(narrowerThan(0, 32))
682       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
683       // narrowScalar limitation.
684       .widenScalarToNextPow2(0)
685       .maxScalar(0, S32)
686       .scalarize(0)
687       .lower();
688 
689     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
690       .legalFor({S32})
691       .minScalar(0, S32)
692       .widenScalarToNextPow2(0)
693       .scalarize(0)
694       .lower();
695   }
696 
697   getActionDefinitionsBuilder(G_INTTOPTR)
698     // List the common cases
699     .legalForCartesianProduct(AddrSpaces64, {S64})
700     .legalForCartesianProduct(AddrSpaces32, {S32})
701     .scalarize(0)
702     // Accept any address space as long as the size matches
703     .legalIf(sameSize(0, 1))
704     .widenScalarIf(smallerThan(1, 0),
705       [](const LegalityQuery &Query) {
706         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
707       })
708     .narrowScalarIf(greaterThan(1, 0),
709       [](const LegalityQuery &Query) {
710         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
711       });
712 
713   getActionDefinitionsBuilder(G_PTRTOINT)
714     // List the common cases
715     .legalForCartesianProduct(AddrSpaces64, {S64})
716     .legalForCartesianProduct(AddrSpaces32, {S32})
717     .scalarize(0)
718     // Accept any address space as long as the size matches
719     .legalIf(sameSize(0, 1))
720     .widenScalarIf(smallerThan(0, 1),
721       [](const LegalityQuery &Query) {
722         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
723       })
724     .narrowScalarIf(
725       greaterThan(0, 1),
726       [](const LegalityQuery &Query) {
727         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
728       });
729 
730   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
731     .scalarize(0)
732     .custom();
733 
734   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
735   // handle some operations by just promoting the register during
736   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
737   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
738     switch (AS) {
739     // FIXME: Private element size.
740     case AMDGPUAS::PRIVATE_ADDRESS:
741       return 32;
742     // FIXME: Check subtarget
743     case AMDGPUAS::LOCAL_ADDRESS:
744       return ST.useDS128() ? 128 : 64;
745 
746     // Treat constant and global as identical. SMRD loads are sometimes usable
747     // for global loads (ideally constant address space should be eliminated)
748     // depending on the context. Legality cannot be context dependent, but
749     // RegBankSelect can split the load as necessary depending on the pointer
750     // register bank/uniformity and if the memory is invariant or not written in
751     // a kernel.
752     case AMDGPUAS::CONSTANT_ADDRESS:
753     case AMDGPUAS::GLOBAL_ADDRESS:
754       return IsLoad ? 512 : 128;
755     default:
756       return 128;
757     }
758   };
759 
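  // Return true if this memory access must be split into multiple operations,
  // e.g. a vector extload, an access wider than the address space supports, an
  // unsupported 3-dword access, or an under-aligned access.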
760   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
761                                     bool IsLoad) -> bool {
762     const LLT DstTy = Query.Types[0];
763 
764     // Split vector extloads.
765     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
766     unsigned Align = Query.MMODescrs[0].AlignInBits;
767 
768     if (MemSize < DstTy.getSizeInBits())
769       MemSize = std::max(MemSize, Align);
770 
771     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
772       return true;
773 
774     const LLT PtrTy = Query.Types[1];
775     unsigned AS = PtrTy.getAddressSpace();
776     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
777       return true;
778 
779     // Catch weird sized loads that don't evenly divide into the access sizes
780     // TODO: May be able to widen depending on alignment etc.
781     unsigned NumRegs = (MemSize + 31) / 32;
782     if (NumRegs == 3) {
783       if (!ST.hasDwordx3LoadStores())
784         return true;
785     } else {
786       // If the alignment allows, these should have been widened.
787       if (!isPowerOf2_32(NumRegs))
788         return true;
789     }
790 
791     if (Align < MemSize) {
792       const SITargetLowering *TLI = ST.getTargetLowering();
793       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
794     }
795 
796     return false;
797   };
798 
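  // Return true if an odd-sized load should be widened to the next power of
  // two; the extra bits are only loaded when the alignment covers the widened
  // access.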
799   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
800     unsigned Size = Query.Types[0].getSizeInBits();
801     if (isPowerOf2_32(Size))
802       return false;
803 
804     if (Size == 96 && ST.hasDwordx3LoadStores())
805       return false;
806 
807     unsigned AddrSpace = Query.Types[1].getAddressSpace();
808     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
809       return false;
810 
811     unsigned Align = Query.MMODescrs[0].AlignInBits;
812     unsigned RoundedSize = NextPowerOf2(Size);
813     return (Align >= RoundedSize);
814   };
815 
816   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
817   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
818   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
819 
820   // TODO: Refine based on subtargets which support unaligned access or 128-bit
821   // LDS
822   // TODO: Unsupported flat for SI.
823 
824   for (unsigned Op : {G_LOAD, G_STORE}) {
825     const bool IsStore = Op == G_STORE;
826 
827     auto &Actions = getActionDefinitionsBuilder(Op);
828     // Whitelist the common cases.
829     // TODO: Loads to s16 on gfx9
830     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
831                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
832                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
833                                       {S128, GlobalPtr, 128, GlobalAlign32},
834                                       {S64, GlobalPtr, 64, GlobalAlign32},
835                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
836                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
837                                       {S32, GlobalPtr, 8, GlobalAlign8},
838                                       {S32, GlobalPtr, 16, GlobalAlign16},
839 
840                                       {S32, LocalPtr, 32, 32},
841                                       {S64, LocalPtr, 64, 32},
842                                       {V2S32, LocalPtr, 64, 32},
843                                       {S32, LocalPtr, 8, 8},
844                                       {S32, LocalPtr, 16, 16},
845                                       {V2S16, LocalPtr, 32, 32},
846 
847                                       {S32, PrivatePtr, 32, 32},
848                                       {S32, PrivatePtr, 8, 8},
849                                       {S32, PrivatePtr, 16, 16},
850                                       {V2S16, PrivatePtr, 32, 32},
851 
852                                       {S32, FlatPtr, 32, GlobalAlign32},
853                                       {S32, FlatPtr, 16, GlobalAlign16},
854                                       {S32, FlatPtr, 8, GlobalAlign8},
855                                       {V2S16, FlatPtr, 32, GlobalAlign32},
856 
857                                       {S32, ConstantPtr, 32, GlobalAlign32},
858                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
859                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
860                                       {S64, ConstantPtr, 64, GlobalAlign32},
861                                       {S128, ConstantPtr, 128, GlobalAlign32},
862                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
863     Actions
864         .customIf(typeIs(1, Constant32Ptr))
865         // Widen suitably aligned loads by loading extra elements.
866         .moreElementsIf([=](const LegalityQuery &Query) {
867             const LLT Ty = Query.Types[0];
868             return Op == G_LOAD && Ty.isVector() &&
869                    shouldWidenLoadResult(Query);
870           }, moreElementsToNextPow2(0))
871         .widenScalarIf([=](const LegalityQuery &Query) {
872             const LLT Ty = Query.Types[0];
873             return Op == G_LOAD && !Ty.isVector() &&
874                    shouldWidenLoadResult(Query);
875           }, widenScalarOrEltToNextPow2(0))
876         .narrowScalarIf(
877             [=](const LegalityQuery &Query) -> bool {
878               return !Query.Types[0].isVector() &&
879                      needToSplitMemOp(Query, Op == G_LOAD);
880             },
881             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
882               const LLT DstTy = Query.Types[0];
883               const LLT PtrTy = Query.Types[1];
884 
885               const unsigned DstSize = DstTy.getSizeInBits();
886               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
887 
888               // Split extloads.
889               if (DstSize > MemSize)
890                 return std::make_pair(0, LLT::scalar(MemSize));
891 
892               if (!isPowerOf2_32(DstSize)) {
893                 // We're probably decomposing an odd sized store. Try to split
894                 // to the widest type. TODO: Account for alignment. As-is it
895                 // should be OK, since the new parts will be further legalized.
896                 unsigned FloorSize = PowerOf2Floor(DstSize);
897                 return std::make_pair(0, LLT::scalar(FloorSize));
898               }
899 
900               if (DstSize > 32 && (DstSize % 32 != 0)) {
901                 // FIXME: Need a way to specify non-extload of larger size if
902                 // suitably aligned.
903                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
904               }
905 
906               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
907                                                      Op == G_LOAD);
908               if (MemSize > MaxSize)
909                 return std::make_pair(0, LLT::scalar(MaxSize));
910 
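              // Otherwise the access is under-aligned; split it into pieces no
              // wider than the alignment.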
911               unsigned Align = Query.MMODescrs[0].AlignInBits;
912               return std::make_pair(0, LLT::scalar(Align));
913             })
914         .fewerElementsIf(
915             [=](const LegalityQuery &Query) -> bool {
916               return Query.Types[0].isVector() &&
917                      needToSplitMemOp(Query, Op == G_LOAD);
918             },
919             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
920               const LLT DstTy = Query.Types[0];
921               const LLT PtrTy = Query.Types[1];
922 
923               LLT EltTy = DstTy.getElementType();
924               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
925                                                      Op == G_LOAD);
926 
927               // FIXME: Handle widened to power of 2 results better. This ends
928               // up scalarizing.
929               // FIXME: 3 element stores scalarized on SI
930 
931               // Split if it's too large for the address space.
932               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
933                 unsigned NumElts = DstTy.getNumElements();
934                 unsigned EltSize = EltTy.getSizeInBits();
935 
936                 if (MaxSize % EltSize == 0) {
937                   return std::make_pair(
938                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
939                 }
940 
941                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
942 
943                 // FIXME: Refine when odd breakdowns handled
944                 // The scalars will need to be re-legalized.
945                 if (NumPieces == 1 || NumPieces >= NumElts ||
946                     NumElts % NumPieces != 0)
947                   return std::make_pair(0, EltTy);
948 
949                 return std::make_pair(0,
950                                       LLT::vector(NumElts / NumPieces, EltTy));
951               }
952 
953               // FIXME: We could probably handle weird extending loads better.
954               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
955               if (DstTy.getSizeInBits() > MemSize)
956                 return std::make_pair(0, EltTy);
957 
958               unsigned EltSize = EltTy.getSizeInBits();
959               unsigned DstSize = DstTy.getSizeInBits();
960               if (!isPowerOf2_32(DstSize)) {
961                 // We're probably decomposing an odd sized store. Try to split
962                 // to the widest type. TODO: Account for alignment. As-is it
963                 // should be OK, since the new parts will be further legalized.
964                 unsigned FloorSize = PowerOf2Floor(DstSize);
965                 return std::make_pair(
966                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
967               }
968 
969               // Need to split because of alignment.
970               unsigned Align = Query.MMODescrs[0].AlignInBits;
971               if (EltSize > Align &&
972                   (EltSize / Align < DstTy.getNumElements())) {
973                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
974               }
975 
976               // May need relegalization for the scalars.
977               return std::make_pair(0, EltTy);
978             })
979         .minScalar(0, S32);
980 
981     if (IsStore)
982       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
983 
984     // TODO: Need a bitcast lower option?
985     Actions
986         .legalIf([=](const LegalityQuery &Query) {
987           const LLT Ty0 = Query.Types[0];
988           unsigned Size = Ty0.getSizeInBits();
989           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
990           unsigned Align = Query.MMODescrs[0].AlignInBits;
991 
992           // FIXME: Widening store from alignment not valid.
993           if (MemSize < Size)
994             MemSize = std::max(MemSize, Align);
995 
996           // No extending vector loads.
997           if (Size > MemSize && Ty0.isVector())
998             return false;
999 
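          // Accept power-of-two access sizes from 32 to 512 bits. 8-bit and
          // 16-bit accesses are only legal when extended/truncated to 32 bits,
          // and 96-bit accesses need dwordx3 load/store support.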
1000           switch (MemSize) {
1001           case 8:
1002           case 16:
1003             return Size == 32;
1004           case 32:
1005           case 64:
1006           case 128:
1007             return true;
1008           case 96:
1009             return ST.hasDwordx3LoadStores();
1010           case 256:
1011           case 512:
1012             return true;
1013           default:
1014             return false;
1015           }
1016         })
1017         .widenScalarToNextPow2(0)
1018         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1019   }
1020 
1021   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1022                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1023                                                   {S32, GlobalPtr, 16, 2 * 8},
1024                                                   {S32, LocalPtr, 8, 8},
1025                                                   {S32, LocalPtr, 16, 16},
1026                                                   {S32, PrivatePtr, 8, 8},
1027                                                   {S32, PrivatePtr, 16, 16},
1028                                                   {S32, ConstantPtr, 8, 8},
1029                                                   {S32, ConstantPtr, 16, 2 * 8}});
1030   if (ST.hasFlatAddressSpace()) {
1031     ExtLoads.legalForTypesWithMemDesc(
1032         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1033   }
1034 
1035   ExtLoads.clampScalar(0, S32, S32)
1036           .widenScalarToNextPow2(0)
1037           .unsupportedIfMemSizeNotPow2()
1038           .lower();
1039 
1040   auto &Atomics = getActionDefinitionsBuilder(
1041     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1042      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1043      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1044      G_ATOMICRMW_UMIN})
1045     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1046                {S64, GlobalPtr}, {S64, LocalPtr}});
1047   if (ST.hasFlatAddressSpace()) {
1048     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1049   }
1050 
1051   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1052     .legalFor({{S32, LocalPtr}});
1053 
1054   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1055   // demarshalling.
1056   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1057     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1058                 {S32, FlatPtr}, {S64, FlatPtr}})
1059     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1060                {S32, RegionPtr}, {S64, RegionPtr}});
1061   // TODO: Pointer types, any 32-bit or 64-bit vector
1062 
1063   // Condition should be s32 for scalar, s1 for vector.
1064   getActionDefinitionsBuilder(G_SELECT)
1065     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1066           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1067           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1068     .clampScalar(0, S16, S64)
1069     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1070     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1071     .scalarize(1)
1072     .clampMaxNumElements(0, S32, 2)
1073     .clampMaxNumElements(0, LocalPtr, 2)
1074     .clampMaxNumElements(0, PrivatePtr, 2)
1075     .scalarize(0)
1076     .widenScalarToNextPow2(0)
1077     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1078 
1079   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1080   // be more flexible with the shift amount type.
1081   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1082     .legalFor({{S32, S32}, {S64, S32}});
1083   if (ST.has16BitInsts()) {
1084     if (ST.hasVOP3PInsts()) {
1085       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1086             .clampMaxNumElements(0, S16, 2);
1087     } else
1088       Shifts.legalFor({{S16, S32}, {S16, S16}});
1089 
1090     // TODO: Support 16-bit shift amounts
1091     Shifts.clampScalar(1, S32, S32);
1092     Shifts.clampScalar(0, S16, S64);
1093     Shifts.widenScalarToNextPow2(0, 16);
1094   } else {
1095     // Make sure we legalize the shift amount type first, as the general
1096     // expansion for the shifted type will produce much worse code if it hasn't
1097     // been truncated already.
1098     Shifts.clampScalar(1, S32, S32);
1099     Shifts.clampScalar(0, S32, S64);
1100     Shifts.widenScalarToNextPow2(0, 32);
1101   }
1102   Shifts.scalarize(0);
1103 
1104   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1105     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1106     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1107     unsigned IdxTypeIdx = 2;
1108 
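    // Use custom lowering when the element type is 16-bit or a multiple of 32
    // bits, the vector size is a multiple of 32 bits and at most 1024 bits,
    // and the index is 32-bit.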
1109     getActionDefinitionsBuilder(Op)
1110       .customIf([=](const LegalityQuery &Query) {
1111           const LLT EltTy = Query.Types[EltTypeIdx];
1112           const LLT VecTy = Query.Types[VecTypeIdx];
1113           const LLT IdxTy = Query.Types[IdxTypeIdx];
1114           return (EltTy.getSizeInBits() == 16 ||
1115                   EltTy.getSizeInBits() % 32 == 0) &&
1116                  VecTy.getSizeInBits() % 32 == 0 &&
1117                  VecTy.getSizeInBits() <= 1024 &&
1118                  IdxTy.getSizeInBits() == 32;
1119         })
1120       .clampScalar(EltTypeIdx, S32, S64)
1121       .clampScalar(VecTypeIdx, S32, S64)
1122       .clampScalar(IdxTypeIdx, S32, S32);
1123   }
1124 
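  // An extract whose result type does not match the vector's element type is
  // malformed.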
1125   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1126     .unsupportedIf([=](const LegalityQuery &Query) {
1127         const LLT &EltTy = Query.Types[1].getElementType();
1128         return Query.Types[0] != EltTy;
1129       });
1130 
1131   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1132     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1133     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1134 
1135     // FIXME: Doesn't handle extract of illegal sizes.
1136     getActionDefinitionsBuilder(Op)
1137       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1138       // FIXME: Multiples of 16 should not be legal.
1139       .legalIf([=](const LegalityQuery &Query) {
1140           const LLT BigTy = Query.Types[BigTyIdx];
1141           const LLT LitTy = Query.Types[LitTyIdx];
1142           return (BigTy.getSizeInBits() % 32 == 0) &&
1143                  (LitTy.getSizeInBits() % 16 == 0);
1144         })
1145       .widenScalarIf(
1146         [=](const LegalityQuery &Query) {
1147           const LLT BigTy = Query.Types[BigTyIdx];
1148           return (BigTy.getScalarSizeInBits() < 16);
1149         },
1150         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1151       .widenScalarIf(
1152         [=](const LegalityQuery &Query) {
1153           const LLT LitTy = Query.Types[LitTyIdx];
1154           return (LitTy.getScalarSizeInBits() < 16);
1155         },
1156         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1157       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1158       .widenScalarToNextPow2(BigTyIdx, 32);
1159 
1160   }
1161 
1162   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1163     .legalForCartesianProduct(AllS32Vectors, {S32})
1164     .legalForCartesianProduct(AllS64Vectors, {S64})
1165     .clampNumElements(0, V16S32, V32S32)
1166     .clampNumElements(0, V2S64, V16S64)
1167     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1168 
1169   if (ST.hasScalarPackInsts()) {
1170     BuildVector
1171       // FIXME: Should probably widen s1 vectors straight to s32
1172       .minScalarOrElt(0, S16)
1173       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1174       .minScalar(1, S32);
1175 
1176     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1177       .legalFor({V2S16, S32})
1178       .lower();
1179     BuildVector.minScalarOrElt(0, S32);
1180   } else {
1181     BuildVector.customFor({V2S16, S16});
1182     BuildVector.minScalarOrElt(0, S32);
1183 
1184     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1185       .customFor({V2S16, S32})
1186       .lower();
1187   }
1188 
1189   BuildVector.legalIf(isRegisterType(0));
1190 
1191   // FIXME: Clamp maximum size
1192   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1193     .legalIf(isRegisterType(0));
1194 
1195   // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
1196   // pre-legalize.
1197   if (ST.hasVOP3PInsts()) {
1198     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1199       .customFor({V2S16, V2S16})
1200       .lower();
1201   } else
1202     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1203 
1204   // Merge/Unmerge
1205   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1206     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1207     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1208 
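    // True for vectors whose element size is not a power of two between 8 and
    // 512 bits; such vectors are scalarized below.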
1209     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1210       const LLT Ty = Query.Types[TypeIdx];
1211       if (Ty.isVector()) {
1212         const LLT &EltTy = Ty.getElementType();
1213         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1214           return true;
1215         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1216           return true;
1217       }
1218       return false;
1219     };
1220 
1221     auto &Builder = getActionDefinitionsBuilder(Op)
1222       // Try to widen to s16 first for small types.
1223       // TODO: Only do this on targets with legal s16 shifts
1224       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1225 
1226       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1227       .lowerFor({{S16, V2S16}})
1228       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1229       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1230                            elementTypeIs(1, S16)),
1231                        changeTo(1, V2S16))
1232       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1233       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1234       // valid.
1235       .clampScalar(LitTyIdx, S32, S512)
1236       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1237       // Break up vectors with weird elements into scalars
1238       .fewerElementsIf(
1239         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1240         scalarize(0))
1241       .fewerElementsIf(
1242         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1243         scalarize(1))
1244       .clampScalar(BigTyIdx, S32, S1024);
1245 
1246     if (Op == G_MERGE_VALUES) {
1247       Builder.widenScalarIf(
1248         // TODO: Use 16-bit shifts if legal for 8-bit values?
1249         [=](const LegalityQuery &Query) {
1250           const LLT Ty = Query.Types[LitTyIdx];
1251           return Ty.getSizeInBits() < 32;
1252         },
1253         changeTo(LitTyIdx, S32));
1254     }
1255 
1256     Builder.widenScalarIf(
1257       [=](const LegalityQuery &Query) {
1258         const LLT Ty = Query.Types[BigTyIdx];
1259         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1260           Ty.getSizeInBits() % 16 != 0;
1261       },
1262       [=](const LegalityQuery &Query) {
1263         // Pick the next power of 2, or a multiple of 64 over 128,
1264         // whichever is smaller.
1265         const LLT &Ty = Query.Types[BigTyIdx];
1266         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1267         if (NewSizeInBits >= 256) {
1268           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1269           if (RoundedTo < NewSizeInBits)
1270             NewSizeInBits = RoundedTo;
1271         }
1272         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1273       })
1274       .legalIf([=](const LegalityQuery &Query) {
1275           const LLT &BigTy = Query.Types[BigTyIdx];
1276           const LLT &LitTy = Query.Types[LitTyIdx];
1277 
1278           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1279             return false;
1280           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1281             return false;
1282 
1283           return BigTy.getSizeInBits() % 16 == 0 &&
1284                  LitTy.getSizeInBits() % 16 == 0 &&
1285                  BigTy.getSizeInBits() <= 1024;
1286         })
1287       // Any vectors left are the wrong size. Scalarize them.
1288       .scalarize(0)
1289       .scalarize(1);
1290   }
1291 
1292   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1293   // RegBankSelect.
1294   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1295     .legalFor({{S32}, {S64}});
1296 
1297   if (ST.hasVOP3PInsts()) {
1298     SextInReg.lowerFor({{V2S16}})
1299       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1300       // get more vector shift opportunities, since we'll get those when
1301       // expanded.
1302       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1303   } else if (ST.has16BitInsts()) {
1304     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1305   } else {
1306     // Prefer to promote to s32 before lowering if we don't have 16-bit
1307     // shifts. This avoids a lot of intermediate truncate and extend operations.
1308     SextInReg.lowerFor({{S32}, {S64}});
1309   }
1310 
1311   SextInReg
1312     .scalarize(0)
1313     .clampScalar(0, S32, S64)
1314     .lower();
1315 
1316   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1317     .legalFor({S64});
1318 
1319   getActionDefinitionsBuilder({
1320       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1321       G_FCOPYSIGN,
1322 
1323       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1324       G_READ_REGISTER,
1325       G_WRITE_REGISTER,
1326 
1327       G_SADDO, G_SSUBO,
1328 
1329       // TODO: Implement
1330       G_FMINIMUM, G_FMAXIMUM
1331     }).lower();
1332 
1333   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1334         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1335         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1336     .unsupported();
1337 
1338   computeTables();
1339   verify(*ST.getInstrInfo());
1340 }
1341 
1342 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1343                                          MachineRegisterInfo &MRI,
1344                                          MachineIRBuilder &B,
1345                                          GISelChangeObserver &Observer) const {
1346   switch (MI.getOpcode()) {
1347   case TargetOpcode::G_ADDRSPACE_CAST:
1348     return legalizeAddrSpaceCast(MI, MRI, B);
1349   case TargetOpcode::G_FRINT:
1350     return legalizeFrint(MI, MRI, B);
1351   case TargetOpcode::G_FCEIL:
1352     return legalizeFceil(MI, MRI, B);
1353   case TargetOpcode::G_INTRINSIC_TRUNC:
1354     return legalizeIntrinsicTrunc(MI, MRI, B);
1355   case TargetOpcode::G_SITOFP:
1356     return legalizeITOFP(MI, MRI, B, true);
1357   case TargetOpcode::G_UITOFP:
1358     return legalizeITOFP(MI, MRI, B, false);
1359   case TargetOpcode::G_FPTOSI:
1360     return legalizeFPTOI(MI, MRI, B, true);
1361   case TargetOpcode::G_FPTOUI:
1362     return legalizeFPTOI(MI, MRI, B, false);
1363   case TargetOpcode::G_FMINNUM:
1364   case TargetOpcode::G_FMAXNUM:
1365   case TargetOpcode::G_FMINNUM_IEEE:
1366   case TargetOpcode::G_FMAXNUM_IEEE:
1367     return legalizeMinNumMaxNum(MI, MRI, B);
1368   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1369     return legalizeExtractVectorElt(MI, MRI, B);
1370   case TargetOpcode::G_INSERT_VECTOR_ELT:
1371     return legalizeInsertVectorElt(MI, MRI, B);
1372   case TargetOpcode::G_SHUFFLE_VECTOR:
1373     return legalizeShuffleVector(MI, MRI, B);
1374   case TargetOpcode::G_FSIN:
1375   case TargetOpcode::G_FCOS:
1376     return legalizeSinCos(MI, MRI, B);
1377   case TargetOpcode::G_GLOBAL_VALUE:
1378     return legalizeGlobalValue(MI, MRI, B);
1379   case TargetOpcode::G_LOAD:
1380     return legalizeLoad(MI, MRI, B, Observer);
1381   case TargetOpcode::G_FMAD:
1382     return legalizeFMad(MI, MRI, B);
1383   case TargetOpcode::G_FDIV:
1384     return legalizeFDIV(MI, MRI, B);
1385   case TargetOpcode::G_UDIV:
1386   case TargetOpcode::G_UREM:
1387     return legalizeUDIV_UREM(MI, MRI, B);
1388   case TargetOpcode::G_SDIV:
1389   case TargetOpcode::G_SREM:
1390     return legalizeSDIV_SREM(MI, MRI, B);
1391   case TargetOpcode::G_ATOMIC_CMPXCHG:
1392     return legalizeAtomicCmpXChg(MI, MRI, B);
1393   case TargetOpcode::G_FLOG:
1394     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1395   case TargetOpcode::G_FLOG10:
1396     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1397   case TargetOpcode::G_FEXP:
1398     return legalizeFExp(MI, B);
1399   case TargetOpcode::G_FPOW:
1400     return legalizeFPow(MI, B);
1401   case TargetOpcode::G_FFLOOR:
1402     return legalizeFFloor(MI, MRI, B);
1403   case TargetOpcode::G_BUILD_VECTOR:
1404     return legalizeBuildVector(MI, MRI, B);
1405   default:
1406     return false;
1407   }
1408 
1409   llvm_unreachable("expected switch to return");
1410 }
1411 
1412 Register AMDGPULegalizerInfo::getSegmentAperture(
1413   unsigned AS,
1414   MachineRegisterInfo &MRI,
1415   MachineIRBuilder &B) const {
1416   MachineFunction &MF = B.getMF();
1417   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1418   const LLT S32 = LLT::scalar(32);
1419 
1420   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1421 
1422   if (ST.hasApertureRegs()) {
1423     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1424     // getreg.
1425     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1426         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1427         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1428     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1429         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1430         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1431     unsigned Encoding =
1432         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1433         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1434         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1435 
1436     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1437 
1438     B.buildInstr(AMDGPU::S_GETREG_B32)
1439       .addDef(GetReg)
1440       .addImm(Encoding);
1441     MRI.setType(GetReg, S32);
1442 
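    // S_GETREG_B32 returns the aperture field in the low bits of the result;
    // shift it left by the field width (WidthM1 + 1) so it ends up in the
    // high half of the 32-bit aperture base.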
1443     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1444     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1445   }
1446 
1447   Register QueuePtr = MRI.createGenericVirtualRegister(
1448     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1449 
1450   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1451   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1452     return Register();
1453 
1454   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1455   // private_segment_aperture_base_hi.
1456   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1457 
1458   // TODO: can we be smarter about machine pointer info?
1459   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1460   MachineMemOperand *MMO = MF.getMachineMemOperand(
1461     PtrInfo,
1462     MachineMemOperand::MOLoad |
1463     MachineMemOperand::MODereferenceable |
1464     MachineMemOperand::MOInvariant,
1465     4,
1466     MinAlign(64, StructOffset));
1467 
1468   Register LoadAddr;
1469 
1470   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1471   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1472 }
1473 
1474 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1475   MachineInstr &MI, MachineRegisterInfo &MRI,
1476   MachineIRBuilder &B) const {
1477   MachineFunction &MF = B.getMF();
1478 
1479   B.setInstr(MI);
1480 
1481   const LLT S32 = LLT::scalar(32);
1482   Register Dst = MI.getOperand(0).getReg();
1483   Register Src = MI.getOperand(1).getReg();
1484 
1485   LLT DstTy = MRI.getType(Dst);
1486   LLT SrcTy = MRI.getType(Src);
1487   unsigned DestAS = DstTy.getAddressSpace();
1488   unsigned SrcAS = SrcTy.getAddressSpace();
1489 
1490   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1491   // vector element.
1492   assert(!DstTy.isVector());
1493 
1494   const AMDGPUTargetMachine &TM
1495     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1496 
1497   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1498   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1499     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1500     return true;
1501   }
1502 
1503   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1504     // Truncate.
1505     B.buildExtract(Dst, Src, 0);
1506     MI.eraseFromParent();
1507     return true;
1508   }
1509 
1510   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1511     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1512     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1513 
    // FIXME: This is a bit ugly due to merging two 32-bit pointers into one
    // 64-bit pointer of a different address space. Merge operands are required
    // to be the same type, but inserting an extra ptrtoint would be kind of
    // pointless.
1517     auto HighAddr = B.buildConstant(
1518       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1519     B.buildMerge(Dst, {Src, HighAddr});
1520     MI.eraseFromParent();
1521     return true;
1522   }
1523 
1524   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1525     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1526            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1527     unsigned NullVal = TM.getNullPointerValue(DestAS);
1528 
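    // Casting flat to LDS/private keeps the low 32 bits of the pointer, except
    // that a null flat pointer must map to the segment's null value; the
    // select below handles that case.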
1529     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1530     auto FlatNull = B.buildConstant(SrcTy, 0);
1531 
1532     // Extract low 32-bits of the pointer.
1533     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1534 
1535     auto CmpRes =
1536         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1537     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1538 
1539     MI.eraseFromParent();
1540     return true;
1541   }
1542 
1543   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1544     return false;
1545 
1546   if (!ST.hasFlatAddressSpace())
1547     return false;
1548 
1549   auto SegmentNull =
1550       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1551   auto FlatNull =
1552       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1553 
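  // Casting LDS/private to flat builds the 64-bit pointer from the 32-bit
  // segment offset in the low half and the segment aperture in the high half,
  // except that a null segment pointer must map to the flat null value.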
1554   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1555   if (!ApertureReg.isValid())
1556     return false;
1557 
1558   auto CmpRes =
1559       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1560 
1561   // Coerce the type of the low half of the result so we can use merge_values.
1562   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1563 
1564   // TODO: Should we allow mismatched types but matching sizes in merges to
1565   // avoid the ptrtoint?
1566   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1567   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1568 
1569   MI.eraseFromParent();
1570   return true;
1571 }
1572 
1573 bool AMDGPULegalizerInfo::legalizeFrint(
1574   MachineInstr &MI, MachineRegisterInfo &MRI,
1575   MachineIRBuilder &B) const {
1576   B.setInstr(MI);
1577 
1578   Register Src = MI.getOperand(1).getReg();
1579   LLT Ty = MRI.getType(Src);
1580   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1581 
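  // Use the 2^52 trick: adding and then subtracting copysign(2^52, src) rounds
  // src to an integer, because doubles with magnitude >= 2^52 have no fraction
  // bits. Inputs with |src| > 0x1.fffffffffffffp+51 are already integral and
  // are passed through unchanged by the final select.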
1582   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1583   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1584 
1585   auto C1 = B.buildFConstant(Ty, C1Val);
1586   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1587 
1588   // TODO: Should this propagate fast-math-flags?
1589   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1590   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1591 
1592   auto C2 = B.buildFConstant(Ty, C2Val);
1593   auto Fabs = B.buildFAbs(Ty, Src);
1594 
1595   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1596   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1597   return true;
1598 }
1599 
1600 bool AMDGPULegalizerInfo::legalizeFceil(
1601   MachineInstr &MI, MachineRegisterInfo &MRI,
1602   MachineIRBuilder &B) const {
1603   B.setInstr(MI);
1604 
1605   const LLT S1 = LLT::scalar(1);
1606   const LLT S64 = LLT::scalar(64);
1607 
1608   Register Src = MI.getOperand(1).getReg();
1609   assert(MRI.getType(Src) == S64);
1610 
1611   // result = trunc(src)
1612   // if (src > 0.0 && src != result)
1613   //   result += 1.0
1614 
1615   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1616 
1617   const auto Zero = B.buildFConstant(S64, 0.0);
1618   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1622   auto Add = B.buildSelect(S64, And, One, Zero);
1623 
1624   // TODO: Should this propagate fast-math-flags?
1625   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1626   return true;
1627 }
1628 
1629 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1630                                               MachineIRBuilder &B) {
1631   const unsigned FractBits = 52;
1632   const unsigned ExpBits = 11;
1633   LLT S32 = LLT::scalar(32);
1634 
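  // The biased exponent sits in bits [62:52] of the double, i.e. bits [30:20]
  // of the high word: extract ExpBits (11) bits starting at FractBits - 32
  // (20) with ubfe, then subtract the exponent bias (1023).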
1635   auto Const0 = B.buildConstant(S32, FractBits - 32);
1636   auto Const1 = B.buildConstant(S32, ExpBits);
1637 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1641 
1642   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1643 }
1644 
1645 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1646   MachineInstr &MI, MachineRegisterInfo &MRI,
1647   MachineIRBuilder &B) const {
1648   B.setInstr(MI);
1649 
1650   const LLT S1 = LLT::scalar(1);
1651   const LLT S32 = LLT::scalar(32);
1652   const LLT S64 = LLT::scalar(64);
1653 
1654   Register Src = MI.getOperand(1).getReg();
1655   assert(MRI.getType(Src) == S64);
1656 
1657   // TODO: Should this use extract since the low half is unused?
1658   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1659   Register Hi = Unmerge.getReg(1);
1660 
1661   // Extract the upper half, since this is where we will find the sign and
1662   // exponent.
1663   auto Exp = extractF64Exponent(Hi, B);
1664 
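  // Truncate by clearing the fraction bits that lie below the binary point for
  // this exponent; (FractMask >> Exp) covers exactly those bits. A negative
  // exponent means |src| < 1, so the result is a signed zero; an exponent
  // greater than 51 means src is already an integer.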
1665   const unsigned FractBits = 52;
1666 
1667   // Extract the sign bit.
1668   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1669   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1670 
1671   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1672 
1673   const auto Zero32 = B.buildConstant(S32, 0);
1674 
1675   // Extend back to 64-bits.
1676   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1677 
1678   auto Shr = B.buildAShr(S64, FractMask, Exp);
1679   auto Not = B.buildNot(S64, Shr);
1680   auto Tmp0 = B.buildAnd(S64, Src, Not);
1681   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1682 
1683   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1684   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1685 
1686   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1687   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1688   return true;
1689 }
1690 
1691 bool AMDGPULegalizerInfo::legalizeITOFP(
1692   MachineInstr &MI, MachineRegisterInfo &MRI,
1693   MachineIRBuilder &B, bool Signed) const {
1694   B.setInstr(MI);
1695 
1696   Register Dst = MI.getOperand(0).getReg();
1697   Register Src = MI.getOperand(1).getReg();
1698 
1699   const LLT S64 = LLT::scalar(64);
1700   const LLT S32 = LLT::scalar(32);
1701 
1702   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1703 
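  // Convert the 64-bit integer in two 32-bit halves:
  //   result = ldexp((s|u)itofp(hi), 32) + uitofp(lo)
  // Only the high half carries the sign; the low half is always unsigned.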
1704   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1705 
1706   auto CvtHi = Signed ?
1707     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1708     B.buildUITOFP(S64, Unmerge.getReg(1));
1709 
1710   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1711 
1712   auto ThirtyTwo = B.buildConstant(S32, 32);
1713   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1714     .addUse(CvtHi.getReg(0))
1715     .addUse(ThirtyTwo.getReg(0));
1716 
1717   // TODO: Should this propagate fast-math-flags?
1718   B.buildFAdd(Dst, LdExp, CvtLo);
1719   MI.eraseFromParent();
1720   return true;
1721 }
1722 
1723 // TODO: Copied from DAG implementation. Verify logic and document how this
1724 // actually works.
1725 bool AMDGPULegalizerInfo::legalizeFPTOI(
1726   MachineInstr &MI, MachineRegisterInfo &MRI,
1727   MachineIRBuilder &B, bool Signed) const {
1728   B.setInstr(MI);
1729 
1730   Register Dst = MI.getOperand(0).getReg();
1731   Register Src = MI.getOperand(1).getReg();
1732 
1733   const LLT S64 = LLT::scalar(64);
1734   const LLT S32 = LLT::scalar(32);
1735 
1736   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1737 
1738   unsigned Flags = MI.getFlags();
1739 
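  // Split the value into two 32-bit pieces, where K0 = 2^-32 and K1 = -2^32:
  //   trunc = trunc(src)
  //   hi    = floor(trunc * 2^-32)
  //   lo    = fma(hi, -2^32, trunc)   ; remainder below 2^32
  //   dst   = merge(fptoui(lo), fpto(s|u)i(hi))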
1740   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1741   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1742   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1743 
1744   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1745   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1746   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1747 
1748   auto Hi = Signed ?
1749     B.buildFPTOSI(S32, FloorMul) :
1750     B.buildFPTOUI(S32, FloorMul);
1751   auto Lo = B.buildFPTOUI(S32, Fma);
1752 
1753   B.buildMerge(Dst, { Lo, Hi });
1754   MI.eraseFromParent();
1755 
1756   return true;
1757 }
1758 
1759 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1760   MachineInstr &MI, MachineRegisterInfo &MRI,
1761   MachineIRBuilder &B) const {
1762   MachineFunction &MF = B.getMF();
1763   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1764 
1765   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1766                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1767 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1770   if (!MFI->getMode().IEEE)
1771     return !IsIEEEOp;
1772 
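  // With ieee_mode enabled, only the *_IEEE variants match the instruction
  // behavior, so they are left as-is and the plain G_FMINNUM/G_FMAXNUM are
  // expanded through the generic lowering below.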
1773   if (IsIEEEOp)
1774     return true;
1775 
1776   MachineIRBuilder HelperBuilder(MI);
1777   GISelObserverWrapper DummyObserver;
1778   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1779   HelperBuilder.setInstr(MI);
1780   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1781 }
1782 
1783 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1784   MachineInstr &MI, MachineRegisterInfo &MRI,
1785   MachineIRBuilder &B) const {
1786   // TODO: Should move some of this into LegalizerHelper.
1787 
1788   // TODO: Promote dynamic indexing of s16 to s32
1789 
1790   // FIXME: Artifact combiner probably should have replaced the truncated
1791   // constant before this, so we shouldn't need
1792   // getConstantVRegValWithLookThrough.
1793   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1794     MI.getOperand(2).getReg(), MRI);
1795   if (!IdxVal) // Dynamic case will be selected to register indexing.
1796     return true;
1797 
1798   Register Dst = MI.getOperand(0).getReg();
1799   Register Vec = MI.getOperand(1).getReg();
1800 
1801   LLT VecTy = MRI.getType(Vec);
1802   LLT EltTy = VecTy.getElementType();
1803   assert(EltTy == MRI.getType(Dst));
1804 
1805   B.setInstr(MI);
1806 
1807   if (IdxVal->Value < VecTy.getNumElements())
1808     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1809   else
1810     B.buildUndef(Dst);
1811 
1812   MI.eraseFromParent();
1813   return true;
1814 }
1815 
1816 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1817   MachineInstr &MI, MachineRegisterInfo &MRI,
1818   MachineIRBuilder &B) const {
1819   // TODO: Should move some of this into LegalizerHelper.
1820 
1821   // TODO: Promote dynamic indexing of s16 to s32
1822 
1823   // FIXME: Artifact combiner probably should have replaced the truncated
1824   // constant before this, so we shouldn't need
1825   // getConstantVRegValWithLookThrough.
1826   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1827     MI.getOperand(3).getReg(), MRI);
1828   if (!IdxVal) // Dynamic case will be selected to register indexing.
1829     return true;
1830 
1831   Register Dst = MI.getOperand(0).getReg();
1832   Register Vec = MI.getOperand(1).getReg();
1833   Register Ins = MI.getOperand(2).getReg();
1834 
1835   LLT VecTy = MRI.getType(Vec);
1836   LLT EltTy = VecTy.getElementType();
1837   assert(EltTy == MRI.getType(Ins));
1838 
1839   B.setInstr(MI);
1840 
1841   if (IdxVal->Value < VecTy.getNumElements())
1842     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1843   else
1844     B.buildUndef(Dst);
1845 
1846   MI.eraseFromParent();
1847   return true;
1848 }
1849 
1850 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1851   MachineInstr &MI, MachineRegisterInfo &MRI,
1852   MachineIRBuilder &B) const {
1853   const LLT V2S16 = LLT::vector(2, 16);
1854 
1855   Register Dst = MI.getOperand(0).getReg();
1856   Register Src0 = MI.getOperand(1).getReg();
1857   LLT DstTy = MRI.getType(Dst);
1858   LLT SrcTy = MRI.getType(Src0);
1859 
1860   if (SrcTy == V2S16 && DstTy == V2S16 &&
1861       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1862     return true;
1863 
1864   MachineIRBuilder HelperBuilder(MI);
1865   GISelObserverWrapper DummyObserver;
1866   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1867   HelperBuilder.setInstr(MI);
1868   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1869 }
1870 
1871 bool AMDGPULegalizerInfo::legalizeSinCos(
1872   MachineInstr &MI, MachineRegisterInfo &MRI,
1873   MachineIRBuilder &B) const {
1874   B.setInstr(MI);
1875 
1876   Register DstReg = MI.getOperand(0).getReg();
1877   Register SrcReg = MI.getOperand(1).getReg();
1878   LLT Ty = MRI.getType(DstReg);
1879   unsigned Flags = MI.getFlags();
1880 
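  // The hardware sin/cos intrinsics take their input scaled by 1/(2*pi), so
  // pre-multiply the operand; subtargets with a reduced trig range also need
  // the scaled value wrapped into [0, 1) with fract.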
1881   Register TrigVal;
1882   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1883   if (ST.hasTrigReducedRange()) {
1884     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1885     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1886       .addUse(MulVal.getReg(0))
1887       .setMIFlags(Flags).getReg(0);
1888   } else
1889     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1890 
1891   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1892     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1893   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1894     .addUse(TrigVal)
1895     .setMIFlags(Flags);
1896   MI.eraseFromParent();
1897   return true;
1898 }
1899 
1900 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1901   Register DstReg, LLT PtrTy,
1902   MachineIRBuilder &B, const GlobalValue *GV,
1903   unsigned Offset, unsigned GAFlags) const {
1904   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1905   // to the following code sequence:
1906   //
1907   // For constant address space:
1908   //   s_getpc_b64 s[0:1]
1909   //   s_add_u32 s0, s0, $symbol
1910   //   s_addc_u32 s1, s1, 0
1911   //
1912   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1913   //   a fixup or relocation is emitted to replace $symbol with a literal
1914   //   constant, which is a pc-relative offset from the encoding of the $symbol
1915   //   operand to the global variable.
1916   //
1917   // For global address space:
1918   //   s_getpc_b64 s[0:1]
1919   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1920   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1921   //
1922   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1923   //   fixups or relocations are emitted to replace $symbol@*@lo and
1924   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1925   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1926   //   operand to the global variable.
1927   //
1928   // What we want here is an offset from the value returned by s_getpc
1929   // (which is the address of the s_add_u32 instruction) to the global
1930   // variable, but since the encoding of $symbol starts 4 bytes after the start
1931   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1932   // small. This requires us to add 4 to the global variable offset in order to
1933   // compute the correct address.
1934 
1935   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1936 
1937   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1938     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1939 
1940   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1941     .addDef(PCReg);
1942 
1943   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1944   if (GAFlags == SIInstrInfo::MO_NONE)
1945     MIB.addImm(0);
1946   else
1947     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1948 
1949   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1950 
1951   if (PtrTy.getSizeInBits() == 32)
1952     B.buildExtract(DstReg, PCReg, 0);
1953   return true;
}
1955 
1956 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1957   MachineInstr &MI, MachineRegisterInfo &MRI,
1958   MachineIRBuilder &B) const {
1959   Register DstReg = MI.getOperand(0).getReg();
1960   LLT Ty = MRI.getType(DstReg);
1961   unsigned AS = Ty.getAddressSpace();
1962 
1963   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1964   MachineFunction &MF = B.getMF();
1965   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1966   B.setInstr(MI);
1967 
1968   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1969     if (!MFI->isEntryFunction()) {
1970       const Function &Fn = MF.getFunction();
1971       DiagnosticInfoUnsupported BadLDSDecl(
1972         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
1973         DS_Warning);
1974       Fn.getContext().diagnose(BadLDSDecl);
1975 
1976       // We currently don't have a way to correctly allocate LDS objects that
1977       // aren't directly associated with a kernel. We do force inlining of
1978       // functions that use local objects. However, if these dead functions are
1979       // not eliminated, we don't want a compile time error. Just emit a warning
1980       // and a trap, since there should be no callable path here.
1981       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
1982       B.buildUndef(DstReg);
1983       MI.eraseFromParent();
1984       return true;
1985     }
1986 
1987     // TODO: We could emit code to handle the initialization somewhere.
1988     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1989       const SITargetLowering *TLI = ST.getTargetLowering();
1990       if (!TLI->shouldUseLDSConstAddress(GV)) {
1991         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1993       }
1994 
1995       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1996       MI.eraseFromParent();
1997       return true;
1998     }
1999 
2000     const Function &Fn = MF.getFunction();
2001     DiagnosticInfoUnsupported BadInit(
2002       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2003     Fn.getContext().diagnose(BadInit);
2004     return true;
2005   }
2006 
2007   const SITargetLowering *TLI = ST.getTargetLowering();
2008 
2009   if (TLI->shouldEmitFixup(GV)) {
2010     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2011     MI.eraseFromParent();
2012     return true;
2013   }
2014 
2015   if (TLI->shouldEmitPCReloc(GV)) {
2016     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2017     MI.eraseFromParent();
2018     return true;
2019   }
2020 
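  // Otherwise go through the GOT: build a pc-relative address of the GOT entry
  // and load the 64-bit pointer to the global from it, truncating afterwards
  // for 32-bit address spaces.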
2021   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2022   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2023 
2024   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2025     MachinePointerInfo::getGOT(MF),
2026     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2027     MachineMemOperand::MOInvariant,
2028     8 /*Size*/, 8 /*Align*/);
2029 
2030   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2031 
2032   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2034     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2035     B.buildExtract(DstReg, Load, 0);
2036   } else
2037     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2038 
2039   MI.eraseFromParent();
2040   return true;
2041 }
2042 
2043 bool AMDGPULegalizerInfo::legalizeLoad(
2044   MachineInstr &MI, MachineRegisterInfo &MRI,
2045   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2046   B.setInstr(MI);
2047   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2048   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2049   Observer.changingInstr(MI);
2050   MI.getOperand(1).setReg(Cast.getReg(0));
2051   Observer.changedInstr(MI);
2052   return true;
2053 }
2054 
2055 bool AMDGPULegalizerInfo::legalizeFMad(
2056   MachineInstr &MI, MachineRegisterInfo &MRI,
2057   MachineIRBuilder &B) const {
2058   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2059   assert(Ty.isScalar());
2060 
2061   MachineFunction &MF = B.getMF();
2062   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2063 
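  // Keep G_FMAD legal only when denormal handling is not required for the
  // type, since the MAD instructions flush denormals.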
2064   // TODO: Always legal with future ftz flag.
  // FIXME: Is checking just the output type sufficient?
2066   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2067     return true;
2068   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2069     return true;
2070 
2071   MachineIRBuilder HelperBuilder(MI);
2072   GISelObserverWrapper DummyObserver;
2073   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2074   HelperBuilder.setMBB(*MI.getParent());
2075   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2076 }
2077 
2078 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2079   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2080   Register DstReg = MI.getOperand(0).getReg();
2081   Register PtrReg = MI.getOperand(1).getReg();
2082   Register CmpVal = MI.getOperand(2).getReg();
2083   Register NewVal = MI.getOperand(3).getReg();
2084 
2085   assert(SITargetLowering::isFlatGlobalAddrSpace(
2086            MRI.getType(PtrReg).getAddressSpace()) &&
2087          "this should not have been custom lowered");
2088 
2089   LLT ValTy = MRI.getType(CmpVal);
2090   LLT VecTy = LLT::vector(2, ValTy);
2091 
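  // The target cmpxchg pseudo takes the new value and the compare value packed
  // together into a single two-element vector operand.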
2092   B.setInstr(MI);
2093   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2094 
2095   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2096     .addDef(DstReg)
2097     .addUse(PtrReg)
2098     .addUse(PackedVal)
2099     .setMemRefs(MI.memoperands());
2100 
2101   MI.eraseFromParent();
2102   return true;
2103 }
2104 
2105 bool AMDGPULegalizerInfo::legalizeFlog(
2106   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2107   Register Dst = MI.getOperand(0).getReg();
2108   Register Src = MI.getOperand(1).getReg();
2109   LLT Ty = B.getMRI()->getType(Dst);
2110   unsigned Flags = MI.getFlags();
2111   B.setInstr(MI);
2112 
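  // log_b(x) = log2(x) * (1 / log2(b)); the caller passes ln(2) for G_FLOG and
  // log10(2) for G_FLOG10 as Log2BaseInverted.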
2113   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2114   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2115 
2116   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2117   MI.eraseFromParent();
2118   return true;
2119 }
2120 
2121 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2122                                        MachineIRBuilder &B) const {
2123   Register Dst = MI.getOperand(0).getReg();
2124   Register Src = MI.getOperand(1).getReg();
2125   unsigned Flags = MI.getFlags();
2126   LLT Ty = B.getMRI()->getType(Dst);
2127   B.setInstr(MI);
2128 
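  // exp(x) = exp2(x * log2(e)).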
2129   auto K = B.buildFConstant(Ty, numbers::log2e);
2130   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2131   B.buildFExp2(Dst, Mul, Flags);
2132   MI.eraseFromParent();
2133   return true;
2134 }
2135 
2136 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2137                                        MachineIRBuilder &B) const {
2138   Register Dst = MI.getOperand(0).getReg();
2139   Register Src0 = MI.getOperand(1).getReg();
2140   Register Src1 = MI.getOperand(2).getReg();
2141   unsigned Flags = MI.getFlags();
2142   LLT Ty = B.getMRI()->getType(Dst);
2143   B.setInstr(MI);
2144   const LLT S16 = LLT::scalar(16);
2145   const LLT S32 = LLT::scalar(32);
2146 
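  // pow(x, y) = exp2(y * log2(x)), with the multiply done by fmul_legacy
  // (0 * anything == 0 semantics).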
2147   if (Ty == S32) {
2148     auto Log = B.buildFLog2(S32, Src0, Flags);
2149     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2150       .addUse(Log.getReg(0))
2151       .addUse(Src1)
2152       .setMIFlags(Flags);
2153     B.buildFExp2(Dst, Mul, Flags);
2154   } else if (Ty == S16) {
2155     // There's no f16 fmul_legacy, so we need to convert for it.
2156     auto Log = B.buildFLog2(S16, Src0, Flags);
2157     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2158     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2159     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2160       .addUse(Ext0.getReg(0))
2161       .addUse(Ext1.getReg(0))
2162       .setMIFlags(Flags);
2163 
2164     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2165   } else
2166     return false;
2167 
2168   MI.eraseFromParent();
2169   return true;
2170 }
2171 
2172 // Find a source register, ignoring any possible source modifiers.
2173 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2174   Register ModSrc = OrigSrc;
2175   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2176     ModSrc = SrcFNeg->getOperand(1).getReg();
2177     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2178       ModSrc = SrcFAbs->getOperand(1).getReg();
2179   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2180     ModSrc = SrcFAbs->getOperand(1).getReg();
2181   return ModSrc;
2182 }
2183 
2184 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2185                                          MachineRegisterInfo &MRI,
2186                                          MachineIRBuilder &B) const {
2187   B.setInstr(MI);
2188 
2189   const LLT S1 = LLT::scalar(1);
2190   const LLT S64 = LLT::scalar(64);
2191   Register Dst = MI.getOperand(0).getReg();
2192   Register OrigSrc = MI.getOperand(1).getReg();
2193   unsigned Flags = MI.getFlags();
2194   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2195          "this should not have been custom lowered");
2196 
2197   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2198   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2199   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2200   // V_FRACT bug is:
2201   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2202   //
2203   // Convert floor(x) to (x - fract(x))
2204 
2205   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2206     .addUse(OrigSrc)
2207     .setMIFlags(Flags);
2208 
2209   // Give source modifier matching some assistance before obscuring a foldable
2210   // pattern.
2211 
  // TODO: Can we avoid the fneg of the fract result? The input sign to fract
  // shouldn't matter?
2214   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2215 
2216   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2217 
2218   Register Min = MRI.createGenericVirtualRegister(S64);
2219 
2220   // We don't need to concern ourselves with the snan handling difference, so
2221   // use the one which will directly select.
2222   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2223   if (MFI->getMode().IEEE)
2224     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2225   else
2226     B.buildFMinNum(Min, Fract, Const, Flags);
2227 
2228   Register CorrectedFract = Min;
2229   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2230     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2231     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2232   }
2233 
2234   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2235   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2236 
2237   MI.eraseFromParent();
2238   return true;
2239 }
2240 
2241 // Turn an illegal packed v2s16 build vector into bit operations.
2242 // TODO: This should probably be a bitcast action in LegalizerHelper.
2243 bool AMDGPULegalizerInfo::legalizeBuildVector(
2244   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2245   Register Dst = MI.getOperand(0).getReg();
2246   const LLT S32 = LLT::scalar(32);
2247   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2248 
2249   Register Src0 = MI.getOperand(1).getReg();
2250   Register Src1 = MI.getOperand(2).getReg();
2251   assert(MRI.getType(Src0) == LLT::scalar(16));
2252 
2253   B.setInstr(MI);
2254   auto Merge = B.buildMerge(S32, {Src0, Src1});
2255   B.buildBitcast(Dst, Merge);
2256 
2257   MI.eraseFromParent();
2258   return true;
2259 }
2260 
2261 // Return the use branch instruction, otherwise null if the usage is invalid.
2262 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2263                                        MachineRegisterInfo &MRI,
2264                                        MachineInstr *&Br) {
2265   Register CondDef = MI.getOperand(0).getReg();
2266   if (!MRI.hasOneNonDBGUse(CondDef))
2267     return nullptr;
2268 
2269   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2270   if (UseMI.getParent() != MI.getParent() ||
2271       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2272     return nullptr;
2273 
2274   // Make sure the cond br is followed by a G_BR
2275   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2276   if (Next != MI.getParent()->end()) {
2277     if (Next->getOpcode() != AMDGPU::G_BR)
2278       return nullptr;
2279     Br = &*Next;
2280   }
2281 
2282   return &UseMI;
2283 }
2284 
2285 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2286                                                MachineRegisterInfo &MRI,
2287                                                Register LiveIn,
2288                                                Register PhyReg) const {
2289   assert(PhyReg.isPhysical() && "Physical register expected");
2290 
  // Insert the live-in copy, if required, by defining the destination virtual
  // register.
2293   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2294   if (!MRI.getVRegDef(LiveIn)) {
2295     // FIXME: Should have scoped insert pt
2296     MachineBasicBlock &OrigInsBB = B.getMBB();
2297     auto OrigInsPt = B.getInsertPt();
2298 
2299     MachineBasicBlock &EntryMBB = B.getMF().front();
2300     EntryMBB.addLiveIn(PhyReg);
2301     B.setInsertPt(EntryMBB, EntryMBB.begin());
2302     B.buildCopy(LiveIn, PhyReg);
2303 
2304     B.setInsertPt(OrigInsBB, OrigInsPt);
2305   }
2306 
2307   return LiveIn;
2308 }
2309 
2310 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2311                                                 MachineRegisterInfo &MRI,
2312                                                 Register PhyReg, LLT Ty,
2313                                                 bool InsertLiveInCopy) const {
2314   assert(PhyReg.isPhysical() && "Physical register expected");
2315 
  // Get or create the virtual live-in register.
2317   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2318   if (!LiveIn) {
2319     LiveIn = MRI.createGenericVirtualRegister(Ty);
2320     MRI.addLiveIn(PhyReg, LiveIn);
2321   }
2322 
  // When the only copy actually required is from the virtual register to the
  // physical register (to be inserted later), the live-in copy from the
  // physical register to the virtual register is not needed.
2326   if (!InsertLiveInCopy)
2327     return LiveIn;
2328 
2329   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2330 }
2331 
2332 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2333     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2334   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2335   const ArgDescriptor *Arg;
2336   const TargetRegisterClass *RC;
2337   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2338   if (!Arg) {
2339     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2340     return nullptr;
2341   }
2342   return Arg;
2343 }
2344 
2345 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2346                                          const ArgDescriptor *Arg) const {
2347   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2348     return false; // TODO: Handle these
2349 
2350   Register SrcReg = Arg->getRegister();
2351   assert(SrcReg.isPhysical() && "Physical register expected");
2352   assert(DstReg.isVirtual() && "Virtual register expected");
2353 
2354   MachineRegisterInfo &MRI = *B.getMRI();
2355 
2356   LLT Ty = MRI.getType(DstReg);
2357   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2358 
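  // Masked arguments occupy a bit field of the incoming register (such as the
  // packed workitem IDs); extract the field by shifting out the mask's
  // trailing zeros and ANDing with the shifted mask.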
2359   if (Arg->isMasked()) {
2360     // TODO: Should we try to emit this once in the entry block?
2361     const LLT S32 = LLT::scalar(32);
2362     const unsigned Mask = Arg->getMask();
2363     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2364 
2365     Register AndMaskSrc = LiveIn;
2366 
2367     if (Shift != 0) {
2368       auto ShiftAmt = B.buildConstant(S32, Shift);
2369       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2370     }
2371 
2372     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2373   } else {
2374     B.buildCopy(DstReg, LiveIn);
2375   }
2376 
2377   return true;
2378 }
2379 
2380 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2381     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2382     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2383   B.setInstr(MI);
2384 
2385   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2386   if (!Arg)
2387     return false;
2388 
2389   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2390     return false;
2391 
2392   MI.eraseFromParent();
2393   return true;
2394 }
2395 
2396 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2397                                        MachineRegisterInfo &MRI,
2398                                        MachineIRBuilder &B) const {
2399   B.setInstr(MI);
2400   Register Dst = MI.getOperand(0).getReg();
2401   LLT DstTy = MRI.getType(Dst);
2402   LLT S16 = LLT::scalar(16);
2403   LLT S32 = LLT::scalar(32);
2404   LLT S64 = LLT::scalar(64);
2405 
2406   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2407     return true;
2408 
2409   if (DstTy == S16)
2410     return legalizeFDIV16(MI, MRI, B);
2411   if (DstTy == S32)
2412     return legalizeFDIV32(MI, MRI, B);
2413   if (DstTy == S64)
2414     return legalizeFDIV64(MI, MRI, B);
2415 
2416   return false;
2417 }
2418 
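// Approximate 2^32 / Src: convert Src to float, take the reciprocal, scale by
// 2^32 (0x4f800000) and convert back to an unsigned integer.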
2419 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2420   const LLT S32 = LLT::scalar(32);
2421 
2422   auto Cvt0 = B.buildUITOFP(S32, Src);
2423   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2424   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2425   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2426   return B.buildFPTOUI(S32, Mul).getReg(0);
2427 }
2428 
2429 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2430                                                   Register DstReg,
2431                                                   Register Num,
2432                                                   Register Den,
2433                                                   bool IsRem) const {
2434   const LLT S1 = LLT::scalar(1);
2435   const LLT S32 = LLT::scalar(32);
2436 
  // RCP = URECIP(Den) = 2^32 / Den + e, where e is the rounding error.
2439   auto RCP = buildDivRCP(B, Den);
2440 
2441   // RCP_LO = mul(RCP, Den)
2442   auto RCP_LO = B.buildMul(S32, RCP, Den);
2443 
  // RCP_HI = mulhu(RCP, Den)
2445   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2446 
2447   // NEG_RCP_LO = -RCP_LO
2448   auto Zero = B.buildConstant(S32, 0);
2449   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2450 
2451   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2452   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2453   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2454 
2455   // Calculate the rounding error from the URECIP instruction
2456   // E = mulhu(ABS_RCP_LO, RCP)
2457   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2458 
2459   // RCP_A_E = RCP + E
2460   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2461 
2462   // RCP_S_E = RCP - E
2463   auto RCP_S_E = B.buildSub(S32, RCP, E);
2464 
  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2466   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2467 
  // Quotient = mulhu(Tmp0, Num)
2469   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2470 
2471   // Num_S_Remainder = Quotient * Den
2472   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2473 
2474   // Remainder = Num - Num_S_Remainder
2475   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2476 
2477   // Remainder_GE_Den = Remainder >= Den
2478   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2479 
2480   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2481   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2482                                        Num, Num_S_Remainder);
2483 
2484   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2485   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2486 
2487   // Calculate Division result:
2488 
2489   // Quotient_A_One = Quotient + 1
2490   auto One = B.buildConstant(S32, 1);
2491   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2492 
2493   // Quotient_S_One = Quotient - 1
2494   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2495 
2496   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2497   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2498 
2499   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2500   if (IsRem) {
2501     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2502 
2503     // Calculate Rem result:
2504     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2505 
2506     // Remainder_A_Den = Remainder + Den
2507     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2508 
2509     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2510     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2511 
2512     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2513     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2514   } else {
2515     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2516   }
2517 }
2518 
2519 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2520                                               MachineRegisterInfo &MRI,
2521                                               MachineIRBuilder &B) const {
2522   B.setInstr(MI);
2523   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2524   Register DstReg = MI.getOperand(0).getReg();
2525   Register Num = MI.getOperand(1).getReg();
2526   Register Den = MI.getOperand(2).getReg();
2527   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2528   MI.eraseFromParent();
2529   return true;
2530 }
2531 
2532 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2533                                             MachineRegisterInfo &MRI,
2534                                             MachineIRBuilder &B) const {
2535   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2536     return legalizeUDIV_UREM32(MI, MRI, B);
2537   return false;
2538 }
2539 
2540 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2541                                               MachineRegisterInfo &MRI,
2542                                               MachineIRBuilder &B) const {
2543   B.setInstr(MI);
2544   const LLT S32 = LLT::scalar(32);
2545 
2546   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2547   Register DstReg = MI.getOperand(0).getReg();
2548   Register LHS = MI.getOperand(1).getReg();
2549   Register RHS = MI.getOperand(2).getReg();
2550 
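  // Implement signed division in terms of the unsigned expansion: take
  // absolute values with (x + sign) ^ sign, where sign = x >> 31, divide, then
  // fix the sign: the remainder takes the sign of the dividend and the
  // quotient takes the xor of both signs.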
2551   auto ThirtyOne = B.buildConstant(S32, 31);
2552   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2554 
2555   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2556   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2557 
2558   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2559   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2560 
2561   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2562   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2563 
2564   if (IsRem) {
2565     auto RSign = LHSign; // Remainder sign is the same as LHS
2566     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2567     B.buildSub(DstReg, UDivRem, RSign);
2568   } else {
2569     auto DSign = B.buildXor(S32, LHSign, RHSign);
2570     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2571     B.buildSub(DstReg, UDivRem, DSign);
2572   }
2573 
2574   MI.eraseFromParent();
2575   return true;
2576 }
2577 
2578 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2579                                             MachineRegisterInfo &MRI,
2580                                             MachineIRBuilder &B) const {
2581   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2582     return legalizeSDIV_SREM32(MI, MRI, B);
2583   return false;
2584 }
2585 
2586 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2587                                                  MachineRegisterInfo &MRI,
2588                                                  MachineIRBuilder &B) const {
2589   Register Res = MI.getOperand(0).getReg();
2590   Register LHS = MI.getOperand(1).getReg();
2591   Register RHS = MI.getOperand(2).getReg();
2592 
2593   uint16_t Flags = MI.getFlags();
2594 
2595   LLT ResTy = MRI.getType(Res);
2596   LLT S32 = LLT::scalar(32);
2597   LLT S64 = LLT::scalar(64);
2598 
2599   const MachineFunction &MF = B.getMF();
2600   bool Unsafe =
2601     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2602 
2603   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2604     return false;
2605 
2606   if (!Unsafe && ResTy == S32 &&
2607       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2608     return false;
2609 
2610   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2611     // 1 / x -> RCP(x)
2612     if (CLHS->isExactlyValue(1.0)) {
2613       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2614         .addUse(RHS)
2615         .setMIFlags(Flags);
2616 
2617       MI.eraseFromParent();
2618       return true;
2619     }
2620 
2621     // -1 / x -> RCP( FNEG(x) )
2622     if (CLHS->isExactlyValue(-1.0)) {
2623       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2624       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2625         .addUse(FNeg.getReg(0))
2626         .setMIFlags(Flags);
2627 
2628       MI.eraseFromParent();
2629       return true;
2630     }
2631   }
2632 
2633   // x / y -> x * (1.0 / y)
2634   if (Unsafe) {
2635     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2636       .addUse(RHS)
2637       .setMIFlags(Flags);
2638     B.buildFMul(Res, LHS, RCP, Flags);
2639 
2640     MI.eraseFromParent();
2641     return true;
2642   }
2643 
2644   return false;
2645 }
2646 
2647 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2648                                          MachineRegisterInfo &MRI,
2649                                          MachineIRBuilder &B) const {
2650   B.setInstr(MI);
2651   Register Res = MI.getOperand(0).getReg();
2652   Register LHS = MI.getOperand(1).getReg();
2653   Register RHS = MI.getOperand(2).getReg();
2654 
2655   uint16_t Flags = MI.getFlags();
2656 
2657   LLT S16 = LLT::scalar(16);
2658   LLT S32 = LLT::scalar(32);
2659 
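  // Expand f16 fdiv by extending to f32, multiplying by the f32 reciprocal,
  // truncating back to f16, and letting div_fixup handle the special cases.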
2660   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2661   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2662 
2663   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2664     .addUse(RHSExt.getReg(0))
2665     .setMIFlags(Flags);
2666 
2667   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2668   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2669 
2670   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2671     .addUse(RDst.getReg(0))
2672     .addUse(RHS)
2673     .addUse(LHS)
2674     .setMIFlags(Flags);
2675 
2676   MI.eraseFromParent();
2677   return true;
2678 }
2679 
2680 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2681 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2682 static void toggleSPDenormMode(bool Enable,
2683                                MachineIRBuilder &B,
2684                                const GCNSubtarget &ST,
2685                                AMDGPU::SIModeRegisterDefaults Mode) {
2686   // Set SP denorm mode to this value.
2687   unsigned SPDenormMode =
2688     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2689 
2690   if (ST.hasDenormModeInst()) {
2691     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2692     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2693 
2694     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2695     B.buildInstr(AMDGPU::S_DENORM_MODE)
2696       .addImm(NewDenormModeValue);
2697 
2698   } else {
2699     // Select FP32 bit field in mode register.
2700     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2701                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2702                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2703 
2704     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2705       .addImm(SPDenormMode)
2706       .addImm(SPDenormModeBitField);
2707   }
2708 }
2709 
2710 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2711                                          MachineRegisterInfo &MRI,
2712                                          MachineIRBuilder &B) const {
2713   B.setInstr(MI);
2714   Register Res = MI.getOperand(0).getReg();
2715   Register LHS = MI.getOperand(1).getReg();
2716   Register RHS = MI.getOperand(2).getReg();
2717   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2718   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2719 
2720   uint16_t Flags = MI.getFlags();
2721 
2722   LLT S32 = LLT::scalar(32);
2723   LLT S1 = LLT::scalar(1);
2724 
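  // Scale the operands with div_scale, form a reciprocal estimate of the
  // scaled denominator, refine it and the quotient with a Newton-Raphson style
  // chain of FMAs, then let div_fmas/div_fixup undo the scaling and handle the
  // special cases. FP32 denormals are temporarily enabled around the
  // refinement since the scaled intermediate values may be denormal.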
2725   auto One = B.buildFConstant(S32, 1.0f);
2726 
2727   auto DenominatorScaled =
2728     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2729       .addUse(RHS)
2730       .addUse(LHS)
2731       .addImm(1)
2732       .setMIFlags(Flags);
2733   auto NumeratorScaled =
2734     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2735       .addUse(LHS)
2736       .addUse(RHS)
2737       .addImm(0)
2738       .setMIFlags(Flags);
2739 
2740   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2741     .addUse(DenominatorScaled.getReg(0))
2742     .setMIFlags(Flags);
2743   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2744 
2745   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2746   // aren't modeled as reading it.
2747   if (!Mode.allFP32Denormals())
2748     toggleSPDenormMode(true, B, ST, Mode);
2749 
2750   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2751   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2752   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2753   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2754   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2755   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2756 
2757   if (!Mode.allFP32Denormals())
2758     toggleSPDenormMode(false, B, ST, Mode);
2759 
2760   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2761     .addUse(Fma4.getReg(0))
2762     .addUse(Fma1.getReg(0))
2763     .addUse(Fma3.getReg(0))
2764     .addUse(NumeratorScaled.getReg(1))
2765     .setMIFlags(Flags);
2766 
2767   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2768     .addUse(Fmas.getReg(0))
2769     .addUse(RHS)
2770     .addUse(LHS)
2771     .setMIFlags(Flags);
2772 
2773   MI.eraseFromParent();
2774   return true;
2775 }
2776 
2777 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2778                                          MachineRegisterInfo &MRI,
2779                                          MachineIRBuilder &B) const {
2780   B.setInstr(MI);
2781   Register Res = MI.getOperand(0).getReg();
2782   Register LHS = MI.getOperand(1).getReg();
2783   Register RHS = MI.getOperand(2).getReg();
2784 
2785   uint16_t Flags = MI.getFlags();
2786 
2787   LLT S64 = LLT::scalar(64);
2788   LLT S1 = LLT::scalar(1);
2789 
2790   auto One = B.buildFConstant(S64, 1.0);
2791 
2792   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2793     .addUse(LHS)
2794     .addUse(RHS)
2795     .addImm(1)
2796     .setMIFlags(Flags);
2797 
2798   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2799 
2800   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2801     .addUse(DivScale0.getReg(0))
2802     .setMIFlags(Flags);
2803 
2804   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2805   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2806   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2807 
2808   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2809     .addUse(LHS)
2810     .addUse(RHS)
2811     .addImm(0)
2812     .setMIFlags(Flags);
2813 
2814   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2816   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2817 
2818   Register Scale;
2819   if (!ST.hasUsableDivScaleConditionOutput()) {
2820     // Workaround a hardware bug on SI where the condition output from div_scale
2821     // is not usable.
2822 
2823     LLT S32 = LLT::scalar(32);
2824 
2825     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2826     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2827     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2828     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2829 
2830     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2831                               Scale1Unmerge.getReg(1));
2832     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2833                               Scale0Unmerge.getReg(1));
2834     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2835   } else {
2836     Scale = DivScale1.getReg(1);
2837   }
2838 
2839   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2840     .addUse(Fma4.getReg(0))
2841     .addUse(Fma3.getReg(0))
2842     .addUse(Mul.getReg(0))
2843     .addUse(Scale)
2844     .setMIFlags(Flags);
2845 
2846   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2847     .addUse(Fmas.getReg(0))
2848     .addUse(RHS)
2849     .addUse(LHS)
2850     .setMIFlags(Flags);
2851 
2852   MI.eraseFromParent();
2853   return true;
2854 }
2855 
2856 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2857                                                  MachineRegisterInfo &MRI,
2858                                                  MachineIRBuilder &B) const {
2859   B.setInstr(MI);
2860   Register Res = MI.getOperand(0).getReg();
2861   Register LHS = MI.getOperand(2).getReg();
2862   Register RHS = MI.getOperand(3).getReg();
2863   uint16_t Flags = MI.getFlags();
2864 
2865   LLT S32 = LLT::scalar(32);
2866   LLT S1 = LLT::scalar(1);
2867 
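  // Fast lowering: if |RHS| is larger than 2^96 (0x6f800000), pre-scale it by
  // 2^-32 (0x2f800000), presumably so its reciprocal does not flush to zero,
  // and multiply the result by the same scale factor:
  //   res = Sel * (LHS * rcp(RHS * Sel)), Sel in {1.0, 2^-32}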
2868   auto Abs = B.buildFAbs(S32, RHS, Flags);
2869   const APFloat C0Val(1.0f);
2870 
2871   auto C0 = B.buildConstant(S32, 0x6f800000);
2872   auto C1 = B.buildConstant(S32, 0x2f800000);
2873   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2874 
2875   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2876   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2877 
2878   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2879 
2880   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2881     .addUse(Mul0.getReg(0))
2882     .setMIFlags(Flags);
2883 
2884   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2885 
2886   B.buildFMul(Res, Sel, Mul1, Flags);
2887 
2888   MI.eraseFromParent();
2889   return true;
2890 }
2891 
2892 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2893                                                  MachineRegisterInfo &MRI,
2894                                                  MachineIRBuilder &B) const {
2895   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2896   if (!MFI->isEntryFunction()) {
2897     return legalizePreloadedArgIntrin(MI, MRI, B,
2898                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2899   }
2900 
2901   B.setInstr(MI);
2902 
2903   uint64_t Offset =
2904     ST.getTargetLowering()->getImplicitParameterOffset(
2905       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2906   Register DstReg = MI.getOperand(0).getReg();
2907   LLT DstTy = MRI.getType(DstReg);
2908   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2909 
2910   const ArgDescriptor *Arg;
2911   const TargetRegisterClass *RC;
2912   std::tie(Arg, RC)
2913     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2914   if (!Arg)
2915     return false;
2916 
2917   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2918   if (!loadInputValue(KernargPtrReg, B, Arg))
2919     return false;
2920 
2921   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2922   MI.eraseFromParent();
2923   return true;
2924 }
2925 
2926 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2927                                               MachineRegisterInfo &MRI,
2928                                               MachineIRBuilder &B,
2929                                               unsigned AddrSpace) const {
2930   B.setInstr(MI);
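  // is.shared / is.private lower to comparing the high 32 bits of the flat
  // pointer against the corresponding aperture base.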
2931   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2932   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2933   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2934   MI.eraseFromParent();
2935   return true;
2936 }
2937 
2938 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2939 // offset (the offset that is included in bounds checking and swizzling, to be
2940 // split between the instruction's voffset and immoffset fields) and soffset
2941 // (the offset that is excluded from bounds checking and swizzling, to go in
2942 // the instruction's soffset field).  This function takes the first kind of
2943 // offset and figures out how to split it between voffset and immoffset.
2944 std::tuple<Register, unsigned, unsigned>
2945 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2946                                         Register OrigOffset) const {
2947   const unsigned MaxImm = 4095;
2948   Register BaseReg;
2949   unsigned TotalConstOffset;
2950   MachineInstr *OffsetDef;
2951   const LLT S32 = LLT::scalar(32);
2952 
2953   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2954     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2955 
2956   unsigned ImmOffset = TotalConstOffset;
2957 
  // If the immediate value is too big for the immoffset field, put only the
  // low 12 bits (the value modulo 4096) into the immoffset field, so that the
  // value that is copied/added for the voffset field is a multiple of 4096
  // and stands more chance of being CSEd with the copy/add for another
  // similar load/store.
2962   // However, do not do that rounding down to a multiple of 4096 if that is a
2963   // negative number, as it appears to be illegal to have a negative offset
2964   // in the vgpr, even if adding the immediate offset makes it positive.
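  //
  // A rough example: a combined constant offset of 4100 is split into an
  // immoffset of 4 with 4096 added into the voffset register, while a
  // negative offset such as -4 keeps immoffset 0 and leaves the entire value
  // in the voffset register.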
2965   unsigned Overflow = ImmOffset & ~MaxImm;
2966   ImmOffset -= Overflow;
2967   if ((int32_t)Overflow < 0) {
2968     Overflow += ImmOffset;
2969     ImmOffset = 0;
2970   }
2971 
2972   if (Overflow != 0) {
2973     if (!BaseReg) {
2974       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2975     } else {
2976       auto OverflowVal = B.buildConstant(S32, Overflow);
2977       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2978     }
2979   }
2980 
2981   if (!BaseReg)
2982     BaseReg = B.buildConstant(S32, 0).getReg(0);
2983 
2984   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2985 }
2986 
2987 /// Handle register layout difference for f16 images for some subtargets.
2988 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2989                                              MachineRegisterInfo &MRI,
2990                                              Register Reg) const {
2991   if (!ST.hasUnpackedD16VMem())
2992     return Reg;
2993 
2994   const LLT S16 = LLT::scalar(16);
2995   const LLT S32 = LLT::scalar(32);
2996   LLT StoreVT = MRI.getType(Reg);
2997   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
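  // Sketch of the unpacked layout: each 16-bit element occupies the low half
  // of its own 32-bit register, so e.g. a <4 x s16> value is any-extended
  // element-wise and rebuilt as <4 x s32>.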
2998 
2999   auto Unmerge = B.buildUnmerge(S16, Reg);
3000 
3001   SmallVector<Register, 4> WideRegs;
3002   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3003     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3004 
3005   int NumElts = StoreVT.getNumElements();
3006 
3007   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3008 }
3009 
3010 Register AMDGPULegalizerInfo::fixStoreSourceType(
3011   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3012   MachineRegisterInfo *MRI = B.getMRI();
3013   LLT Ty = MRI->getType(VData);
3014 
3015   const LLT S16 = LLT::scalar(16);
3016 
3017   // Fixup illegal register types for i8 stores.
3018   if (Ty == LLT::scalar(8) || Ty == S16) {
3019     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3020     return AnyExt;
3021   }
3022 
3023   if (Ty.isVector()) {
3024     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3025       if (IsFormat)
3026         return handleD16VData(B, *MRI, VData);
3027     }
3028   }
3029 
3030   return VData;
3031 }
3032 
3033 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3034                                               MachineRegisterInfo &MRI,
3035                                               MachineIRBuilder &B,
3036                                               bool IsTyped,
3037                                               bool IsFormat) const {
3038   B.setInstr(MI);
3039 
3040   Register VData = MI.getOperand(1).getReg();
3041   LLT Ty = MRI.getType(VData);
3042   LLT EltTy = Ty.getScalarType();
3043   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3044   const LLT S32 = LLT::scalar(32);
3045 
3046   VData = fixStoreSourceType(B, VData, IsFormat);
3047   Register RSrc = MI.getOperand(2).getReg();
3048 
3049   MachineMemOperand *MMO = *MI.memoperands_begin();
3050   const int MemSize = MMO->getSize();
3051 
3052   unsigned ImmOffset;
3053   unsigned TotalOffset;
3054 
3055   // The typed intrinsics add an immediate after the registers.
3056   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
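  // (Sketch) e.g. struct.buffer.store has operands: intrinsic ID, vdata,
  // rsrc, vindex, voffset, soffset, aux = 7, while the raw form omits vindex;
  // the typed variants insert an extra format immediate before aux.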
3057 
3058   // The struct intrinsic variants add one additional operand over raw.
3059   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3060   Register VIndex;
3061   int OpOffset = 0;
3062   if (HasVIndex) {
3063     VIndex = MI.getOperand(3).getReg();
3064     OpOffset = 1;
3065   }
3066 
3067   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3068   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3069 
3070   unsigned Format = 0;
3071   if (IsTyped) {
3072     Format = MI.getOperand(5 + OpOffset).getImm();
3073     ++OpOffset;
3074   }
3075 
3076   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3077 
3078   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3079   if (TotalOffset != 0)
3080     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3081 
3082   unsigned Opc;
3083   if (IsTyped) {
3084     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3085                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3086   } else if (IsFormat) {
3087     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3088                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3089   } else {
3090     switch (MemSize) {
3091     case 1:
3092       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3093       break;
3094     case 2:
3095       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3096       break;
3097     default:
3098       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3099       break;
3100     }
3101   }
3102 
3103   if (!VIndex)
3104     VIndex = B.buildConstant(S32, 0).getReg(0);
3105 
3106   auto MIB = B.buildInstr(Opc)
3107     .addUse(VData)              // vdata
3108     .addUse(RSrc)               // rsrc
3109     .addUse(VIndex)             // vindex
3110     .addUse(VOffset)            // voffset
3111     .addUse(SOffset)            // soffset
3112     .addImm(ImmOffset);         // offset(imm)
3113 
3114   if (IsTyped)
3115     MIB.addImm(Format);
3116 
3117   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3118      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3119      .addMemOperand(MMO);
3120 
3121   MI.eraseFromParent();
3122   return true;
3123 }
3124 
3125 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3126                                              MachineRegisterInfo &MRI,
3127                                              MachineIRBuilder &B,
3128                                              bool IsFormat,
3129                                              bool IsTyped) const {
3130   B.setInstr(MI);
3131 
3132   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3133   MachineMemOperand *MMO = *MI.memoperands_begin();
3134   const int MemSize = MMO->getSize();
3135   const LLT S32 = LLT::scalar(32);
3136 
3137   Register Dst = MI.getOperand(0).getReg();
3138   Register RSrc = MI.getOperand(2).getReg();
3139 
3140   // The typed intrinsics add an immediate after the registers.
3141   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3142 
3143   // The struct intrinsic variants add one additional operand over raw.
3144   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3145   Register VIndex;
3146   int OpOffset = 0;
3147   if (HasVIndex) {
3148     VIndex = MI.getOperand(3).getReg();
3149     OpOffset = 1;
3150   }
3151 
3152   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3153   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3154 
3155   unsigned Format = 0;
3156   if (IsTyped) {
3157     Format = MI.getOperand(5 + OpOffset).getImm();
3158     ++OpOffset;
3159   }
3160 
3161   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3162   unsigned ImmOffset;
3163   unsigned TotalOffset;
3164 
3165   LLT Ty = MRI.getType(Dst);
3166   LLT EltTy = Ty.getScalarType();
3167   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3168   const bool Unpacked = ST.hasUnpackedD16VMem();
3169 
3170   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3171   if (TotalOffset != 0)
3172     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3173 
3174   unsigned Opc;
3175 
3176   if (IsTyped) {
3177     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3178                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3179   } else if (IsFormat) {
3180     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3181                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3182   } else {
3183     switch (MemSize) {
3184     case 1:
3185       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3186       break;
3187     case 2:
3188       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3189       break;
3190     default:
3191       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3192       break;
3193     }
3194   }
3195 
3196   Register LoadDstReg;
3197 
3198   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3199   LLT UnpackedTy = Ty.changeElementSize(32);
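  // Sketch: sub-dword loads and scalar d16 results go through a temporary s32,
  // and unpacked d16 vectors through an s32-per-element vector; both are
  // converted back to the original result type after the instruction is built.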
3200 
3201   if (IsExtLoad)
3202     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3203   else if (Unpacked && IsD16 && Ty.isVector())
3204     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3205   else
3206     LoadDstReg = Dst;
3207 
3208   if (!VIndex)
3209     VIndex = B.buildConstant(S32, 0).getReg(0);
3210 
3211   auto MIB = B.buildInstr(Opc)
3212     .addDef(LoadDstReg)         // vdata
3213     .addUse(RSrc)               // rsrc
3214     .addUse(VIndex)             // vindex
3215     .addUse(VOffset)            // voffset
3216     .addUse(SOffset)            // soffset
3217     .addImm(ImmOffset);         // offset(imm)
3218 
3219   if (IsTyped)
3220     MIB.addImm(Format);
3221 
3222   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3223      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3224      .addMemOperand(MMO);
3225 
3226   if (LoadDstReg != Dst) {
3227     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3228 
    // The result for an extending load was widened to 32 bits; truncate it
    // back to the original type.
3230     if (IsExtLoad)
3231       B.buildTrunc(Dst, LoadDstReg);
3232     else {
3233       // Repack to original 16-bit vector result
3234       // FIXME: G_TRUNC should work, but legalization currently fails
3235       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3236       SmallVector<Register, 4> Repack;
3237       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3238         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3239       B.buildMerge(Dst, Repack);
3240     }
3241   }
3242 
3243   MI.eraseFromParent();
3244   return true;
3245 }
3246 
3247 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3248                                                MachineIRBuilder &B,
3249                                                bool IsInc) const {
3250   B.setInstr(MI);
3251   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3252                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3253   B.buildInstr(Opc)
3254     .addDef(MI.getOperand(0).getReg())
3255     .addUse(MI.getOperand(2).getReg())
3256     .addUse(MI.getOperand(3).getReg())
3257     .cloneMemRefs(MI);
3258   MI.eraseFromParent();
3259   return true;
3260 }
3261 
3262 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3263   switch (IntrID) {
3264   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3265   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3266     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3267   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3268   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3269     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3270   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3271   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3272     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3273   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3274   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3275     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3276   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3277   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3278     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3279   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3280   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3281     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3282   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3283   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3284     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3285   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3286   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3287     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3288   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3289   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3290     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3291   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3292   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3293     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3294   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3295   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3296     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3297   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3298   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3299     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3300   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3301   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3302     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3303   default:
3304     llvm_unreachable("unhandled atomic opcode");
3305   }
3306 }
3307 
3308 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3309                                                MachineIRBuilder &B,
3310                                                Intrinsic::ID IID) const {
3311   B.setInstr(MI);
3312 
3313   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3314                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3315 
3316   Register Dst = MI.getOperand(0).getReg();
3317   Register VData = MI.getOperand(2).getReg();
3318 
3319   Register CmpVal;
3320   int OpOffset = 0;
3321 
3322   if (IsCmpSwap) {
3323     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3324     ++OpOffset;
3325   }
3326 
3327   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3328   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3329 
3330   // The struct intrinsic variants add one additional operand over raw.
3331   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3332   Register VIndex;
3333   if (HasVIndex) {
3334     VIndex = MI.getOperand(4 + OpOffset).getReg();
3335     ++OpOffset;
3336   }
3337 
3338   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3339   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3340   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3341 
3342   MachineMemOperand *MMO = *MI.memoperands_begin();
3343 
3344   unsigned ImmOffset;
3345   unsigned TotalOffset;
3346   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3347   if (TotalOffset != 0)
3348     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3349 
3350   if (!VIndex)
3351     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3352 
3353   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3354     .addDef(Dst)
3355     .addUse(VData); // vdata
3356 
3357   if (IsCmpSwap)
3358     MIB.addReg(CmpVal);
3359 
3360   MIB.addUse(RSrc)               // rsrc
3361      .addUse(VIndex)             // vindex
3362      .addUse(VOffset)            // voffset
3363      .addUse(SOffset)            // soffset
3364      .addImm(ImmOffset)          // offset(imm)
3365      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3366      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3367      .addMemOperand(MMO);
3368 
3369   MI.eraseFromParent();
3370   return true;
3371 }
3372 
/// Pack the 16-bit address operands of \p MI into dword-sized <2 x s16>
/// registers, appending the results to \p PackedAddrs.
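///
/// For example (sketch): a pair of 16-bit coordinates (s, t) becomes a single
/// <2 x s16> build_vector, while a trailing odd coordinate, or the last
/// gradient of an odd-sized gradient half, is paired with an undef s16.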
3375 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3376                                         SmallVectorImpl<Register> &PackedAddrs,
3377                                         int AddrIdx, int DimIdx, int NumVAddrs,
3378                                         int NumGradients) {
3379   const LLT S16 = LLT::scalar(16);
3380   const LLT V2S16 = LLT::vector(2, 16);
3381 
3382   for (int I = AddrIdx; I < AddrIdx + NumVAddrs; ++I) {
3383     Register AddrReg = MI.getOperand(I).getReg();
3384 
3385     if (I < DimIdx) {
3386       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3387       PackedAddrs.push_back(AddrReg);
3388     } else {
3389       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3390       // derivatives dx/dh and dx/dv are packed with undef.
3391       if (((I + 1) >= (AddrIdx + NumVAddrs)) ||
3392           ((NumGradients / 2) % 2 == 1 &&
3393            (I == DimIdx + (NumGradients / 2) - 1 ||
3394             I == DimIdx + NumGradients - 1))) {
3395         PackedAddrs.push_back(
3396             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3397                 .getReg(0));
3398       } else {
3399         PackedAddrs.push_back(
3400             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3401                 .getReg(0));
3402         ++I;
3403       }
3404     }
3405   }
3406 }
3407 
3408 /// Convert from separate vaddr components to a single vector address register,
3409 /// and replace the remaining operands with $noreg.
3410 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3411                                      int DimIdx, int NumVAddrs) {
3412   SmallVector<Register, 8> AddrRegs(NumVAddrs);
3413   for (int I = 0; I != NumVAddrs; ++I) {
3414     AddrRegs[I] = MI.getOperand(DimIdx + I).getReg();
3415     assert(B.getMRI()->getType(AddrRegs[I]) == LLT::scalar(32));
3416   }
3417 
3418   auto VAddr = B.buildBuildVector(LLT::vector(NumVAddrs, 32), AddrRegs);
3419   MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3420   for (int I = 1; I != NumVAddrs; ++I)
3421     MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3422 }
3423 
3424 /// Return number of address arguments, and the number of gradients
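/// (e.g. a 2-D gradient sample contributes 2 coordinates and 4 gradient
/// values, plus any extra arguments and an optional lod/clamp/mip operand).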
3425 static std::pair<int, int>
3426 getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
3427                  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
3428   const AMDGPU::MIMGDimInfo *DimInfo
3429     = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
3430 
3431   int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
3432   int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
3433   int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
3434   int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
3435   return {NumVAddr, NumGradients};
3436 }
3437 
3438 static int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
3439                        int NumDefs) {
3440   assert(!BaseOpcode->Atomic);
3441   return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
3442 }
3443 
3444 /// Return first address operand index in an image intrinsic.
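/// For a non-atomic load with a single def the operand layout is roughly:
/// dst, intrinsic ID, dmask, vaddr..., so the first address operand is at
/// index 3; atomics place their data operand(s) before the addresses instead.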
3445 static int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
3446                                  int NumDefs) {
3447   if (BaseOpcode->Atomic)
3448     return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
3449   return getDMaskIdx(BaseOpcode, NumDefs) + 1;
3450 }
3451 
3452 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3453 ///
3454 /// Depending on the subtarget, load/store with 16-bit element data need to be
3455 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3456 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3457 /// registers.
3458 ///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
3465 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3466     MachineInstr &MI, MachineIRBuilder &B,
3467     GISelChangeObserver &Observer,
3468     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3469   B.setInstr(MI);
3470 
3471   const int NumDefs = MI.getNumExplicitDefs();
3472   bool IsTFE = NumDefs == 2;
3473   // We are only processing the operands of d16 image operations on subtargets
3474   // that use the unpacked register layout, or need to repack the TFE result.
3475 
3476   // TODO: Do we need to guard against already legalized intrinsics?
3477   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3478     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3479 
3480   MachineRegisterInfo *MRI = B.getMRI();
3481   const LLT S32 = LLT::scalar(32);
3482   const LLT S16 = LLT::scalar(16);
3483   const LLT V2S16 = LLT::vector(2, 16);
3484 
3485   // Index of first address argument
3486   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3487 
  // Check for 16-bit addresses and pack them if so.
3489   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3490   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3491   const bool IsA16 = AddrTy == S16;
3492 
3493   int NumVAddrs, NumGradients;
3494   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3495   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3496     getDMaskIdx(BaseOpcode, NumDefs);
3497   unsigned DMask = 0;
3498 
3499   int DMaskLanes = 0;
3500   if (!BaseOpcode->Atomic) {
3501     DMask = MI.getOperand(DMaskIdx).getImm();
3502     if (BaseOpcode->Gather4) {
3503       DMaskLanes = 4;
3504     } else if (DMask != 0) {
3505       DMaskLanes = countPopulation(DMask);
3506     } else if (!IsTFE && !BaseOpcode->Store) {
3507       // If dmask is 0, this is a no-op load. This can be eliminated.
3508       B.buildUndef(MI.getOperand(0));
3509       MI.eraseFromParent();
3510       return true;
3511     }
3512   }
3513 
3514   Observer.changingInstr(MI);
3515   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3516 
3517   unsigned NewOpcode = NumDefs == 0 ?
3518     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3519 
3520   // Track that we legalized this
3521   MI.setDesc(B.getTII().get(NewOpcode));
3522 
  // Expecting to get an error flag since TFE is on and dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
3525   if (IsTFE && DMask == 0) {
3526     DMask = 0x1;
3527     DMaskLanes = 1;
3528     MI.getOperand(DMaskIdx).setImm(DMask);
3529   }
3530 
3531   // If the register allocator cannot place the address registers contiguously
3532   // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually
  // neutral or better for code size.
3535   //
3536   // However, we currently have no way of hinting to the register allocator
3537   // that MIMG addresses should be placed contiguously when it is possible to
3538   // do so, so force non-NSA for the common 2-address case as a heuristic.
3539   //
3540   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3541   // allocation when possible.
3542   const bool UseNSA = NumVAddrs >= 3 &&
3543                       ST.hasFeature(AMDGPU::FeatureNSAEncoding);
3544 
3545   // Rewrite the addressing register layout before doing anything else.
3546   if (IsA16) {
3547     // FIXME: this feature is missing from gfx10. When that is fixed, this check
3548     // should be introduced.
3549     if (!ST.hasR128A16() && !ST.hasGFX10A16())
3550       return false;
3551 
3552     if (NumVAddrs > 1) {
3553       SmallVector<Register, 4> PackedRegs;
3554       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, NumVAddrs,
3555                                   NumGradients);
3556 
3557       if (!UseNSA && PackedRegs.size() > 1) {
3558         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3559         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3560         PackedRegs[0] = Concat.getReg(0);
3561         PackedRegs.resize(1);
3562       }
3563 
3564       const int NumPacked = PackedRegs.size();
3565       for (int I = 0; I != NumVAddrs; ++I) {
3566         assert(MI.getOperand(AddrIdx + I).getReg() != AMDGPU::NoRegister);
3567 
3568         if (I < NumPacked)
3569           MI.getOperand(AddrIdx + I).setReg(PackedRegs[I]);
3570         else
3571           MI.getOperand(AddrIdx + I).setReg(AMDGPU::NoRegister);
3572       }
3573     }
3574   } else if (!UseNSA && NumVAddrs > 1) {
3575     convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3576   }
3577 
3578   if (BaseOpcode->Store) { // No TFE for stores?
3579     // TODO: Handle dmask trim
3580     Register VData = MI.getOperand(1).getReg();
3581     LLT Ty = MRI->getType(VData);
3582     if (!Ty.isVector() || Ty.getElementType() != S16)
3583       return true;
3584 
3585     B.setInstr(MI);
3586 
3587     Register RepackedReg = handleD16VData(B, *MRI, VData);
3588     if (RepackedReg != VData) {
3589       MI.getOperand(1).setReg(RepackedReg);
3590     }
3591 
3592     return true;
3593   }
3594 
3595   Register DstReg = MI.getOperand(0).getReg();
3596   LLT Ty = MRI->getType(DstReg);
3597   const LLT EltTy = Ty.getScalarType();
3598   const bool IsD16 = Ty.getScalarType() == S16;
3599   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3600 
3601   // Confirm that the return type is large enough for the dmask specified
3602   if (NumElts < DMaskLanes)
3603     return false;
3604 
3605   if (NumElts > 4 || DMaskLanes > 4)
3606     return false;
3607 
3608   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3609   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3610 
3611   // The raw dword aligned data component of the load. The only legal cases
3612   // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3614   LLT RoundedTy;
3615 
  // S32 vector to cover all data, plus the TFE result element.
3617   LLT TFETy;
3618 
3619   // Register type to use for each loaded component. Will be S32 or V2S16.
3620   LLT RegTy;
3621 
3622   if (IsD16 && ST.hasUnpackedD16VMem()) {
3623     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3624     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3625     RegTy = S32;
3626   } else {
3627     unsigned EltSize = EltTy.getSizeInBits();
3628     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3629     unsigned RoundedSize = 32 * RoundedElts;
3630     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3631     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3632     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3633   }
3634 
3635   // The return type does not need adjustment.
3636   // TODO: Should we change s16 case to s32 or <2 x s16>?
3637   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3638     return true;
3639 
3640   Register Dst1Reg;
3641 
3642   // Insert after the instruction.
3643   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3644 
3645   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3646   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3647   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3648   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3649 
3650   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3651 
3652   MI.getOperand(0).setReg(NewResultReg);
3653 
  // In the IR, TFE is supposed to be used with a 2-element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.
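  // e.g. (sketch) a TFE load returning <4 x s32> of data is rewritten to
  // produce a single <5 x s32> register; the unmerge below then splits the
  // status dword back out into what was the second result register.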
3658 
3659   if (IsTFE) {
3660     Dst1Reg = MI.getOperand(1).getReg();
3661     if (MRI->getType(Dst1Reg) != S32)
3662       return false;
3663 
3664     // TODO: Make sure the TFE operand bit is set.
3665     MI.RemoveOperand(1);
3666 
3667     // Handle the easy case that requires no repack instructions.
3668     if (Ty == S32) {
3669       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3670       return true;
3671     }
3672   }
3673 
3674   // Now figure out how to copy the new result register back into the old
3675   // result.
3676   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3677 
3678   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3679 
3680   if (ResultNumRegs == 1) {
3681     assert(!IsTFE);
3682     ResultRegs[0] = NewResultReg;
3683   } else {
3684     // We have to repack into a new vector of some kind.
3685     for (int I = 0; I != NumDataRegs; ++I)
3686       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3687     B.buildUnmerge(ResultRegs, NewResultReg);
3688 
3689     // Drop the final TFE element to get the data part. The TFE result is
3690     // directly written to the right place already.
3691     if (IsTFE)
3692       ResultRegs.resize(NumDataRegs);
3693   }
3694 
  // For an s16 scalar result, form it by truncating the s32 load result,
  // regardless of packed vs. unpacked.
3697   if (IsD16 && !Ty.isVector()) {
3698     B.buildTrunc(DstReg, ResultRegs[0]);
3699     return true;
3700   }
3701 
3702   // Avoid a build/concat_vector of 1 entry.
3703   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
3704     B.buildBitcast(DstReg, ResultRegs[0]);
3705     return true;
3706   }
3707 
3708   assert(Ty.isVector());
3709 
3710   if (IsD16) {
3711     // For packed D16 results with TFE enabled, all the data components are
3712     // S32. Cast back to the expected type.
3713     //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
3716     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
3717       for (Register &Reg : ResultRegs)
3718         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
3719     } else if (ST.hasUnpackedD16VMem()) {
3720       for (Register &Reg : ResultRegs)
3721         Reg = B.buildTrunc(S16, Reg).getReg(0);
3722     }
3723   }
3724 
3725   auto padWithUndef = [&](LLT Ty, int NumElts) {
3726     if (NumElts == 0)
3727       return;
3728     Register Undef = B.buildUndef(Ty).getReg(0);
3729     for (int I = 0; I != NumElts; ++I)
3730       ResultRegs.push_back(Undef);
3731   };
3732 
3733   // Pad out any elements eliminated due to the dmask.
3734   LLT ResTy = MRI->getType(ResultRegs[0]);
3735   if (!ResTy.isVector()) {
3736     padWithUndef(ResTy, NumElts - ResultRegs.size());
3737     B.buildBuildVector(DstReg, ResultRegs);
3738     return true;
3739   }
3740 
3741   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
3742   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
3743 
3744   // Deal with the one annoying legal case.
3745   const LLT V3S16 = LLT::vector(3, 16);
3746   if (Ty == V3S16) {
3747     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
3748     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
3749     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
3750     return true;
3751   }
3752 
3753   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
3754   B.buildConcatVectors(DstReg, ResultRegs);
3755   return true;
3756 }
3757 
3758 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3759   MachineInstr &MI, MachineIRBuilder &B,
3760   GISelChangeObserver &Observer) const {
3761   Register Dst = MI.getOperand(0).getReg();
3762   LLT Ty = B.getMRI()->getType(Dst);
3763   unsigned Size = Ty.getSizeInBits();
3764   MachineFunction &MF = B.getMF();
3765 
3766   Observer.changingInstr(MI);
3767 
3768   // FIXME: We don't really need this intermediate instruction. The intrinsic
3769   // should be fixed to have a memory operand. Since it's readnone, we're not
3770   // allowed to add one.
3771   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3772   MI.RemoveOperand(1); // Remove intrinsic ID
3773 
3774   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3775   // TODO: Should this use datalayout alignment?
3776   const unsigned MemSize = (Size + 7) / 8;
3777   const unsigned MemAlign = 4;
3778   MachineMemOperand *MMO = MF.getMachineMemOperand(
3779     MachinePointerInfo(),
3780     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3781     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3782   MI.addMemOperand(MF, MMO);
3783 
3784   // There are no 96-bit result scalar loads, but widening to 128-bit should
3785   // always be legal. We may need to restore this to a 96-bit result if it turns
3786   // out this needs to be converted to a vector load during RegBankSelect.
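  // e.g. an s96 result is widened to s128 and a <3 x s32> result to <4 x s32>
  // via the pow2 type helpers used below.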
3787   if (!isPowerOf2_32(Size)) {
3788     LegalizerHelper Helper(MF, *this, Observer, B);
3789     B.setInstr(MI);
3790 
3791     if (Ty.isVector())
3792       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3793     else
3794       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3795   }
3796 
3797   Observer.changedInstr(MI);
3798   return true;
3799 }
3800 
3801 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
3802                                                 MachineRegisterInfo &MRI,
3803                                                 MachineIRBuilder &B) const {
3804   B.setInstr(MI);
3805 
  // If this is a non-HSA path or the trap handler is disabled, insert an
  // s_endpgm instruction.
3807   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3808       !ST.isTrapHandlerEnabled()) {
3809     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
3810   } else {
3811     // Pass queue pointer to trap handler as input, and insert trap instruction
3812     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
3813     const ArgDescriptor *Arg =
3814         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
3815     if (!Arg)
3816       return false;
3817     MachineRegisterInfo &MRI = *B.getMRI();
3818     Register SGPR01(AMDGPU::SGPR0_SGPR1);
3819     Register LiveIn = getLiveInRegister(
3820         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
3821         /*InsertLiveInCopy=*/false);
3822     if (!loadInputValue(LiveIn, B, Arg))
3823       return false;
3824     B.buildCopy(SGPR01, LiveIn);
3825     B.buildInstr(AMDGPU::S_TRAP)
3826         .addImm(GCNSubtarget::TrapIDLLVMTrap)
3827         .addReg(SGPR01, RegState::Implicit);
3828   }
3829 
3830   MI.eraseFromParent();
3831   return true;
3832 }
3833 
3834 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
3835     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3836   B.setInstr(MI);
3837 
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
3840   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
3841       !ST.isTrapHandlerEnabled()) {
3842     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
3843                                      "debugtrap handler not supported",
3844                                      MI.getDebugLoc(), DS_Warning);
3845     LLVMContext &Ctx = B.getMF().getFunction().getContext();
3846     Ctx.diagnose(NoTrap);
3847   } else {
3848     // Insert debug-trap instruction
3849     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
3850   }
3851 
3852   MI.eraseFromParent();
3853   return true;
3854 }
3855 
3856 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3857                                             MachineIRBuilder &B,
3858                                             GISelChangeObserver &Observer) const {
3859   MachineRegisterInfo &MRI = *B.getMRI();
3860 
  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
3862   auto IntrID = MI.getIntrinsicID();
3863   switch (IntrID) {
3864   case Intrinsic::amdgcn_if:
3865   case Intrinsic::amdgcn_else: {
3866     MachineInstr *Br = nullptr;
3867     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3868       const SIRegisterInfo *TRI
3869         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3870 
3871       B.setInstr(*BrCond);
3872       Register Def = MI.getOperand(1).getReg();
3873       Register Use = MI.getOperand(3).getReg();
3874 
3875       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3876       if (Br)
3877         BrTarget = Br->getOperand(0).getMBB();
3878 
3879       if (IntrID == Intrinsic::amdgcn_if) {
3880         B.buildInstr(AMDGPU::SI_IF)
3881           .addDef(Def)
3882           .addUse(Use)
3883           .addMBB(BrTarget);
3884       } else {
3885         B.buildInstr(AMDGPU::SI_ELSE)
3886           .addDef(Def)
3887           .addUse(Use)
3888           .addMBB(BrTarget)
3889           .addImm(0);
3890       }
3891 
3892       if (Br)
3893         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3894 
3895       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3896       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3897       MI.eraseFromParent();
3898       BrCond->eraseFromParent();
3899       return true;
3900     }
3901 
3902     return false;
3903   }
3904   case Intrinsic::amdgcn_loop: {
3905     MachineInstr *Br = nullptr;
3906     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3907       const SIRegisterInfo *TRI
3908         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3909 
3910       B.setInstr(*BrCond);
3911 
3912       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3913       if (Br)
3914         BrTarget = Br->getOperand(0).getMBB();
3915 
3916       Register Reg = MI.getOperand(2).getReg();
3917       B.buildInstr(AMDGPU::SI_LOOP)
3918         .addUse(Reg)
3919         .addMBB(BrTarget);
3920 
3921       if (Br)
3922         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3923 
3924       MI.eraseFromParent();
3925       BrCond->eraseFromParent();
3926       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3927       return true;
3928     }
3929 
3930     return false;
3931   }
3932   case Intrinsic::amdgcn_kernarg_segment_ptr:
3933     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
3934       B.setInstr(MI);
3935       // This only makes sense to call in a kernel, so just lower to null.
3936       B.buildConstant(MI.getOperand(0).getReg(), 0);
3937       MI.eraseFromParent();
3938       return true;
3939     }
3940 
3941     return legalizePreloadedArgIntrin(
3942       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3943   case Intrinsic::amdgcn_implicitarg_ptr:
3944     return legalizeImplicitArgPtr(MI, MRI, B);
3945   case Intrinsic::amdgcn_workitem_id_x:
3946     return legalizePreloadedArgIntrin(MI, MRI, B,
3947                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3948   case Intrinsic::amdgcn_workitem_id_y:
3949     return legalizePreloadedArgIntrin(MI, MRI, B,
3950                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3951   case Intrinsic::amdgcn_workitem_id_z:
3952     return legalizePreloadedArgIntrin(MI, MRI, B,
3953                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3954   case Intrinsic::amdgcn_workgroup_id_x:
3955     return legalizePreloadedArgIntrin(MI, MRI, B,
3956                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3957   case Intrinsic::amdgcn_workgroup_id_y:
3958     return legalizePreloadedArgIntrin(MI, MRI, B,
3959                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3960   case Intrinsic::amdgcn_workgroup_id_z:
3961     return legalizePreloadedArgIntrin(MI, MRI, B,
3962                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3963   case Intrinsic::amdgcn_dispatch_ptr:
3964     return legalizePreloadedArgIntrin(MI, MRI, B,
3965                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3966   case Intrinsic::amdgcn_queue_ptr:
3967     return legalizePreloadedArgIntrin(MI, MRI, B,
3968                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3969   case Intrinsic::amdgcn_implicit_buffer_ptr:
3970     return legalizePreloadedArgIntrin(
3971       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3972   case Intrinsic::amdgcn_dispatch_id:
3973     return legalizePreloadedArgIntrin(MI, MRI, B,
3974                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3975   case Intrinsic::amdgcn_fdiv_fast:
3976     return legalizeFDIVFastIntrin(MI, MRI, B);
3977   case Intrinsic::amdgcn_is_shared:
3978     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3979   case Intrinsic::amdgcn_is_private:
3980     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3981   case Intrinsic::amdgcn_wavefrontsize: {
3982     B.setInstr(MI);
3983     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3984     MI.eraseFromParent();
3985     return true;
3986   }
3987   case Intrinsic::amdgcn_s_buffer_load:
3988     return legalizeSBufferLoad(MI, B, Observer);
3989   case Intrinsic::amdgcn_raw_buffer_store:
3990   case Intrinsic::amdgcn_struct_buffer_store:
3991     return legalizeBufferStore(MI, MRI, B, false, false);
3992   case Intrinsic::amdgcn_raw_buffer_store_format:
3993   case Intrinsic::amdgcn_struct_buffer_store_format:
3994     return legalizeBufferStore(MI, MRI, B, false, true);
3995   case Intrinsic::amdgcn_raw_tbuffer_store:
3996   case Intrinsic::amdgcn_struct_tbuffer_store:
3997     return legalizeBufferStore(MI, MRI, B, true, true);
3998   case Intrinsic::amdgcn_raw_buffer_load:
3999   case Intrinsic::amdgcn_struct_buffer_load:
4000     return legalizeBufferLoad(MI, MRI, B, false, false);
4001   case Intrinsic::amdgcn_raw_buffer_load_format:
4002   case Intrinsic::amdgcn_struct_buffer_load_format:
4003     return legalizeBufferLoad(MI, MRI, B, true, false);
4004   case Intrinsic::amdgcn_raw_tbuffer_load:
4005   case Intrinsic::amdgcn_struct_tbuffer_load:
4006     return legalizeBufferLoad(MI, MRI, B, true, true);
4007   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4008   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4009   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4010   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4011   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4012   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4013   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4014   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4015   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4016   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4017   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4018   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4019   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4020   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4021   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4022   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4023   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4024   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4025   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4026   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4027   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4028   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4029   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4030   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4031   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4032   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4033     return legalizeBufferAtomic(MI, B, IntrID);
4034   case Intrinsic::amdgcn_atomic_inc:
4035     return legalizeAtomicIncDec(MI, B, true);
4036   case Intrinsic::amdgcn_atomic_dec:
4037     return legalizeAtomicIncDec(MI, B, false);
4038   case Intrinsic::trap:
4039     return legalizeTrapIntrinsic(MI, MRI, B);
4040   case Intrinsic::debugtrap:
4041     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4042   default: {
4043     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4044             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4045       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
4046     return true;
4047   }
4048   }
4049 
4050   return true;
4051 }
4052