1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Round the number of elements to the next power of two elements
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
52 // Round the number of bits to the next power of two bits
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
182                                          const GCNTargetMachine &TM)
183   :  ST(ST_) {
184   using namespace TargetOpcode;
185 
186   auto GetAddrSpacePtr = [&TM](unsigned AS) {
187     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
188   };
189 
190   const LLT S1 = LLT::scalar(1);
191   const LLT S16 = LLT::scalar(16);
192   const LLT S32 = LLT::scalar(32);
193   const LLT S64 = LLT::scalar(64);
194   const LLT S96 = LLT::scalar(96);
195   const LLT S128 = LLT::scalar(128);
196   const LLT S256 = LLT::scalar(256);
197   const LLT S1024 = LLT::scalar(1024);
198 
199   const LLT V2S16 = LLT::vector(2, 16);
200   const LLT V4S16 = LLT::vector(4, 16);
201 
202   const LLT V2S32 = LLT::vector(2, 32);
203   const LLT V3S32 = LLT::vector(3, 32);
204   const LLT V4S32 = LLT::vector(4, 32);
205   const LLT V5S32 = LLT::vector(5, 32);
206   const LLT V6S32 = LLT::vector(6, 32);
207   const LLT V7S32 = LLT::vector(7, 32);
208   const LLT V8S32 = LLT::vector(8, 32);
209   const LLT V9S32 = LLT::vector(9, 32);
210   const LLT V10S32 = LLT::vector(10, 32);
211   const LLT V11S32 = LLT::vector(11, 32);
212   const LLT V12S32 = LLT::vector(12, 32);
213   const LLT V13S32 = LLT::vector(13, 32);
214   const LLT V14S32 = LLT::vector(14, 32);
215   const LLT V15S32 = LLT::vector(15, 32);
216   const LLT V16S32 = LLT::vector(16, 32);
217   const LLT V32S32 = LLT::vector(32, 32);
218 
219   const LLT V2S64 = LLT::vector(2, 64);
220   const LLT V3S64 = LLT::vector(3, 64);
221   const LLT V4S64 = LLT::vector(4, 64);
222   const LLT V5S64 = LLT::vector(5, 64);
223   const LLT V6S64 = LLT::vector(6, 64);
224   const LLT V7S64 = LLT::vector(7, 64);
225   const LLT V8S64 = LLT::vector(8, 64);
226   const LLT V16S64 = LLT::vector(16, 64);
227 
228   std::initializer_list<LLT> AllS32Vectors =
229     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
230      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
231   std::initializer_list<LLT> AllS64Vectors =
232     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
233 
234   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
235   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
236   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
237   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
238   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
239   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
240   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
241 
242   const LLT CodePtr = FlatPtr;
243 
244   const std::initializer_list<LLT> AddrSpaces64 = {
245     GlobalPtr, ConstantPtr, FlatPtr
246   };
247 
248   const std::initializer_list<LLT> AddrSpaces32 = {
249     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
250   };
251 
252   const std::initializer_list<LLT> FPTypesBase = {
253     S32, S64
254   };
255 
256   const std::initializer_list<LLT> FPTypes16 = {
257     S32, S64, S16
258   };
259 
260   const std::initializer_list<LLT> FPTypesPK16 = {
261     S32, S64, S16, V2S16
262   };
263 
264   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
265 
266   setAction({G_BRCOND, S1}, Legal); // VCC branches
267   setAction({G_BRCOND, S32}, Legal); // SCC branches
268 
269   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
270   // elements for v3s16
271   getActionDefinitionsBuilder(G_PHI)
272     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
273     .legalFor(AllS32Vectors)
274     .legalFor(AllS64Vectors)
275     .legalFor(AddrSpaces64)
276     .legalFor(AddrSpaces32)
277     .clampScalar(0, S32, S256)
278     .widenScalarToNextPow2(0, 32)
279     .clampMaxNumElements(0, S32, 16)
280     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
281     .legalIf(isPointer(0));
282 
283   if (ST.has16BitInsts()) {
284     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
285       .legalFor({S32, S16})
286       .clampScalar(0, S16, S32)
287       .scalarize(0);
288   } else {
289     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
290       .legalFor({S32})
291       .clampScalar(0, S32, S32)
292       .scalarize(0);
293   }
294 
295   // FIXME: Not really legal. Placeholder for custom lowering.
296   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
297     .legalFor({S32, S64})
298     .clampScalar(0, S32, S64)
299     .widenScalarToNextPow2(0, 32)
300     .scalarize(0);
301 
302   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
303     .legalFor({S32})
304     .clampScalar(0, S32, S32)
305     .scalarize(0);
306 
307   // Report legal for any types we can handle anywhere. For the cases only legal
308   // on the SALU, RegBankSelect will be able to re-legalize.
309   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
310     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
311     .clampScalar(0, S32, S64)
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
314     .widenScalarToNextPow2(0)
315     .scalarize(0);
316 
317   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
318                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
319     .legalFor({{S32, S1}, {S32, S32}})
320     .clampScalar(0, S32, S32)
321     .scalarize(0); // TODO: Implement.
322 
323   getActionDefinitionsBuilder(G_BITCAST)
324     // Don't worry about the size constraint.
325     .legalIf(all(isRegisterType(0), isRegisterType(1)))
326     // FIXME: Testing hack
327     .legalForCartesianProduct({S16, LLT::vector(2, 8), })
328     .lower();
329 
330 
331   getActionDefinitionsBuilder(G_CONSTANT)
332     .legalFor({S1, S32, S64, S16, GlobalPtr,
333                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
334     .clampScalar(0, S32, S64)
335     .widenScalarToNextPow2(0)
336     .legalIf(isPointer(0));
337 
338   getActionDefinitionsBuilder(G_FCONSTANT)
339     .legalFor({S32, S64, S16})
340     .clampScalar(0, S16, S64);
341 
342   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
343     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
344                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
345     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
346     .clampScalarOrElt(0, S32, S1024)
347     .legalIf(isMultiple32(0))
348     .widenScalarToNextPow2(0, 32)
349     .clampMaxNumElements(0, S32, 16);
350 
351   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
352   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
353     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
354   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
355 
356   auto &FPOpActions = getActionDefinitionsBuilder(
357     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
358     .legalFor({S32, S64});
359   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
360     .customFor({S32, S64});
361   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
362     .customFor({S32, S64});
363 
364   if (ST.has16BitInsts()) {
365     if (ST.hasVOP3PInsts())
366       FPOpActions.legalFor({S16, V2S16});
367     else
368       FPOpActions.legalFor({S16});
369 
370     TrigActions.customFor({S16});
371     FDIVActions.customFor({S16});
372   }
373 
374   auto &MinNumMaxNum = getActionDefinitionsBuilder({
375       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
376 
377   if (ST.hasVOP3PInsts()) {
378     MinNumMaxNum.customFor(FPTypesPK16)
379       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
380       .clampMaxNumElements(0, S16, 2)
381       .clampScalar(0, S16, S64)
382       .scalarize(0);
383   } else if (ST.has16BitInsts()) {
384     MinNumMaxNum.customFor(FPTypes16)
385       .clampScalar(0, S16, S64)
386       .scalarize(0);
387   } else {
388     MinNumMaxNum.customFor(FPTypesBase)
389       .clampScalar(0, S32, S64)
390       .scalarize(0);
391   }
392 
393   if (ST.hasVOP3PInsts())
394     FPOpActions.clampMaxNumElements(0, S16, 2);
395 
396   FPOpActions
397     .scalarize(0)
398     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
399 
400   TrigActions
401     .scalarize(0)
402     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
403 
404   FDIVActions
405     .scalarize(0)
406     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
407 
408   getActionDefinitionsBuilder({G_FNEG, G_FABS})
409     .legalFor(FPTypesPK16)
410     .clampMaxNumElements(0, S16, 2)
411     .scalarize(0)
412     .clampScalar(0, S16, S64);
413 
414   if (ST.has16BitInsts()) {
415     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
416       .legalFor({S32, S64, S16})
417       .scalarize(0)
418       .clampScalar(0, S16, S64);
419   } else {
420     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
421       .legalFor({S32, S64})
422       .scalarize(0)
423       .clampScalar(0, S32, S64);
424   }
425 
426   getActionDefinitionsBuilder(G_FPTRUNC)
427     .legalFor({{S32, S64}, {S16, S32}})
428     .scalarize(0);
429 
430   getActionDefinitionsBuilder(G_FPEXT)
431     .legalFor({{S64, S32}, {S32, S16}})
432     .lowerFor({{S64, S16}}) // FIXME: Implement
433     .scalarize(0);
434 
435   getActionDefinitionsBuilder(G_FSUB)
436       // Use actual fsub instruction
437       .legalFor({S32})
438       // Must use fadd + fneg
439       .lowerFor({S64, S16, V2S16})
440       .scalarize(0)
441       .clampScalar(0, S32, S64);
442 
443   // Whether this is legal depends on the floating point mode for the function.
444   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
445   if (ST.hasMadF16())
446     FMad.customFor({S32, S16});
447   else
448     FMad.customFor({S32});
449   FMad.scalarize(0)
450       .lower();
451 
452   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
453     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
454                {S32, S1}, {S64, S1}, {S16, S1}})
455     .scalarize(0)
456     .clampScalar(0, S32, S64);
457 
458   // TODO: Split s1->s64 during regbankselect for VALU.
459   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
460     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
461     .lowerFor({{S32, S64}})
462     .lowerIf(typeIs(1, S1))
463     .customFor({{S64, S64}});
464   if (ST.has16BitInsts())
465     IToFP.legalFor({{S16, S16}});
466   IToFP.clampScalar(1, S32, S64)
467        .scalarize(0);
468 
469   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
470     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
471     .customFor({{S64, S64}});
472   if (ST.has16BitInsts())
473     FPToI.legalFor({{S16, S16}});
474   else
475     FPToI.minScalar(1, S32);
476 
477   FPToI.minScalar(0, S32)
478        .scalarize(0)
479        .lower();
480 
481   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
482     .scalarize(0)
483     .lower();
484 
485   if (ST.has16BitInsts()) {
486     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
487       .legalFor({S16, S32, S64})
488       .clampScalar(0, S16, S64)
489       .scalarize(0);
490   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
491     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
492       .legalFor({S32, S64})
493       .clampScalar(0, S32, S64)
494       .scalarize(0);
495   } else {
496     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
497       .legalFor({S32})
498       .customFor({S64})
499       .clampScalar(0, S32, S64)
500       .scalarize(0);
501   }
502 
503   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
504     .scalarize(0)
505     .alwaysLegal();
506 
507   auto &CmpBuilder =
508     getActionDefinitionsBuilder(G_ICMP)
509     // The compare output type differs based on the register bank of the output,
510     // so make both s1 and s32 legal.
511     //
512     // Scalar compares producing output in scc will be promoted to s32, as that
513     // is the allocatable register type that will be needed for the copy from
514     // scc. This will be promoted during RegBankSelect, and we assume something
515     // before that won't try to use s32 result types.
516     //
517     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
518     // bank.
519     .legalForCartesianProduct(
520       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
521     .legalForCartesianProduct(
522       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
523   if (ST.has16BitInsts()) {
524     CmpBuilder.legalFor({{S1, S16}});
525   }
526 
527   CmpBuilder
528     .widenScalarToNextPow2(1)
529     .clampScalar(1, S32, S64)
530     .scalarize(0)
531     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
532 
533   getActionDefinitionsBuilder(G_FCMP)
534     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
535     .widenScalarToNextPow2(1)
536     .clampScalar(1, S32, S64)
537     .scalarize(0);
538 
539   // FIXME: fpow has a selection pattern that should move to custom lowering.
540   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
541   if (ST.has16BitInsts())
542     Exp2Ops.legalFor({S32, S16});
543   else
544     Exp2Ops.legalFor({S32});
545   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
546   Exp2Ops.scalarize(0);
547 
548   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
549   if (ST.has16BitInsts())
550     ExpOps.customFor({{S32}, {S16}});
551   else
552     ExpOps.customFor({S32});
553   ExpOps.clampScalar(0, MinScalarFPTy, S32)
554         .scalarize(0);
555 
556   // The 64-bit versions produce 32-bit results, but only on the SALU.
557   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
558                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
559                                G_CTPOP})
560     .legalFor({{S32, S32}, {S32, S64}})
561     .clampScalar(0, S32, S32)
562     .clampScalar(1, S32, S64)
563     .scalarize(0)
564     .widenScalarToNextPow2(0, 32)
565     .widenScalarToNextPow2(1, 32);
566 
567   // TODO: Expand for > s32
568   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
569     .legalFor({S32})
570     .clampScalar(0, S32, S32)
571     .scalarize(0);
572 
573   if (ST.has16BitInsts()) {
574     if (ST.hasVOP3PInsts()) {
575       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
576         .legalFor({S32, S16, V2S16})
577         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
578         .clampMaxNumElements(0, S16, 2)
579         .clampScalar(0, S16, S32)
580         .widenScalarToNextPow2(0)
581         .scalarize(0);
582     } else {
583       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
584         .legalFor({S32, S16})
585         .widenScalarToNextPow2(0)
586         .clampScalar(0, S16, S32)
587         .scalarize(0);
588     }
589   } else {
590     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
591       .legalFor({S32})
592       .clampScalar(0, S32, S32)
593       .widenScalarToNextPow2(0)
594       .scalarize(0);
595   }
596 
597   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
598     return [=](const LegalityQuery &Query) {
599       return Query.Types[TypeIdx0].getSizeInBits() <
600              Query.Types[TypeIdx1].getSizeInBits();
601     };
602   };
603 
604   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
605     return [=](const LegalityQuery &Query) {
606       return Query.Types[TypeIdx0].getSizeInBits() >
607              Query.Types[TypeIdx1].getSizeInBits();
608     };
609   };
610 
611   getActionDefinitionsBuilder(G_INTTOPTR)
612     // List the common cases
613     .legalForCartesianProduct(AddrSpaces64, {S64})
614     .legalForCartesianProduct(AddrSpaces32, {S32})
615     .scalarize(0)
616     // Accept any address space as long as the size matches
617     .legalIf(sameSize(0, 1))
618     .widenScalarIf(smallerThan(1, 0),
619       [](const LegalityQuery &Query) {
620         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
621       })
622     .narrowScalarIf(greaterThan(1, 0),
623       [](const LegalityQuery &Query) {
624         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
625       });
626 
627   getActionDefinitionsBuilder(G_PTRTOINT)
628     // List the common cases
629     .legalForCartesianProduct(AddrSpaces64, {S64})
630     .legalForCartesianProduct(AddrSpaces32, {S32})
631     .scalarize(0)
632     // Accept any address space as long as the size matches
633     .legalIf(sameSize(0, 1))
634     .widenScalarIf(smallerThan(0, 1),
635       [](const LegalityQuery &Query) {
636         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
637       })
638     .narrowScalarIf(
639       greaterThan(0, 1),
640       [](const LegalityQuery &Query) {
641         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
642       });
643 
644   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
645     .scalarize(0)
646     .custom();
647 
648   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
649   // handle some operations by just promoting the register during
650   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
651   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
652     switch (AS) {
653     // FIXME: Private element size.
654     case AMDGPUAS::PRIVATE_ADDRESS:
655       return 32;
656     // FIXME: Check subtarget
657     case AMDGPUAS::LOCAL_ADDRESS:
658       return ST.useDS128() ? 128 : 64;
659 
660     // Treat constant and global as identical. SMRD loads are sometimes usable
661     // for global loads (ideally constant address space should be eliminated)
662     // depending on the context. Legality cannot be context dependent, but
663     // RegBankSelect can split the load as necessary depending on the pointer
664     // register bank/uniformity and if the memory is invariant or not written in
665     // a kernel.
666     case AMDGPUAS::CONSTANT_ADDRESS:
667     case AMDGPUAS::GLOBAL_ADDRESS:
668       return IsLoad ? 512 : 128;
669     default:
670       return 128;
671     }
672   };
673 
674   const auto needToSplitMemOp = [=](const LegalityQuery &Query, bool IsLoad) -> bool {
675     const LLT DstTy = Query.Types[0];
676 
677     // Split vector extloads.
678     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
679     unsigned Align = Query.MMODescrs[0].AlignInBits;
680 
681     if (MemSize < DstTy.getSizeInBits())
682       MemSize = std::max(MemSize, Align);
683 
684     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
685       return true;
686 
687     const LLT PtrTy = Query.Types[1];
688     unsigned AS = PtrTy.getAddressSpace();
689     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
690       return true;
691 
692     // Catch weird sized loads that don't evenly divide into the access sizes
693     // TODO: May be able to widen depending on alignment etc.
694     unsigned NumRegs = MemSize / 32;
695     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
696       return true;
697 
698     if (Align < MemSize) {
699       const SITargetLowering *TLI = ST.getTargetLowering();
700       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
701     }
702 
703     return false;
704   };
705 
706   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
707   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
708   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
709 
710   // TODO: Refine based on subtargets which support unaligned access or 128-bit
711   // LDS
712   // TODO: Unsupported flat for SI.
713 
714   for (unsigned Op : {G_LOAD, G_STORE}) {
715     const bool IsStore = Op == G_STORE;
716 
717     auto &Actions = getActionDefinitionsBuilder(Op);
718     // Whitelist the common cases.
719     // TODO: Pointer loads
720     // TODO: Wide constant loads
721     // TODO: Only CI+ has 3x loads
722     // TODO: Loads to s16 on gfx9
723     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
724                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
725                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
726                                       {S96, GlobalPtr, 96, GlobalAlign32},
727                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
728                                       {S128, GlobalPtr, 128, GlobalAlign32},
729                                       {S64, GlobalPtr, 64, GlobalAlign32},
730                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
731                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
732                                       {S32, GlobalPtr, 8, GlobalAlign8},
733                                       {S32, GlobalPtr, 16, GlobalAlign16},
734 
735                                       {S32, LocalPtr, 32, 32},
736                                       {S64, LocalPtr, 64, 32},
737                                       {V2S32, LocalPtr, 64, 32},
738                                       {S32, LocalPtr, 8, 8},
739                                       {S32, LocalPtr, 16, 16},
740                                       {V2S16, LocalPtr, 32, 32},
741 
742                                       {S32, PrivatePtr, 32, 32},
743                                       {S32, PrivatePtr, 8, 8},
744                                       {S32, PrivatePtr, 16, 16},
745                                       {V2S16, PrivatePtr, 32, 32},
746 
747                                       {S32, FlatPtr, 32, GlobalAlign32},
748                                       {S32, FlatPtr, 16, GlobalAlign16},
749                                       {S32, FlatPtr, 8, GlobalAlign8},
750                                       {V2S16, FlatPtr, 32, GlobalAlign32},
751 
752                                       {S32, ConstantPtr, 32, GlobalAlign32},
753                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
754                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
755                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
756                                       {S64, ConstantPtr, 64, GlobalAlign32},
757                                       {S128, ConstantPtr, 128, GlobalAlign32},
758                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
759     Actions
760         .customIf(typeIs(1, Constant32Ptr))
761         .narrowScalarIf(
762             [=](const LegalityQuery &Query) -> bool {
763               return !Query.Types[0].isVector() &&
764                      needToSplitMemOp(Query, Op == G_LOAD);
765             },
766             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
767               const LLT DstTy = Query.Types[0];
768               const LLT PtrTy = Query.Types[1];
769 
770               const unsigned DstSize = DstTy.getSizeInBits();
771               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
772 
773               // Split extloads.
774               if (DstSize > MemSize)
775                 return std::make_pair(0, LLT::scalar(MemSize));
776 
777               if (DstSize > 32 && (DstSize % 32 != 0)) {
778                 // FIXME: Need a way to specify non-extload of larger size if
779                 // suitably aligned.
780                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
781               }
782 
783               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
784                                                      Op == G_LOAD);
785               if (MemSize > MaxSize)
786                 return std::make_pair(0, LLT::scalar(MaxSize));
787 
788               unsigned Align = Query.MMODescrs[0].AlignInBits;
789               return std::make_pair(0, LLT::scalar(Align));
790             })
791         .fewerElementsIf(
792             [=](const LegalityQuery &Query) -> bool {
793               return Query.Types[0].isVector() &&
794                      needToSplitMemOp(Query, Op == G_LOAD);
795             },
796             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
797               const LLT DstTy = Query.Types[0];
798               const LLT PtrTy = Query.Types[1];
799 
800               LLT EltTy = DstTy.getElementType();
801               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
802                                                      Op == G_LOAD);
803 
804               // Split if it's too large for the address space.
805               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
806                 unsigned NumElts = DstTy.getNumElements();
807                 unsigned EltSize = EltTy.getSizeInBits();
808 
809                 if (MaxSize % EltSize == 0) {
810                   return std::make_pair(
811                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
812                 }
813 
814                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
815 
816                 // FIXME: Refine when odd breakdowns handled
817                 // The scalars will need to be re-legalized.
818                 if (NumPieces == 1 || NumPieces >= NumElts ||
819                     NumElts % NumPieces != 0)
820                   return std::make_pair(0, EltTy);
821 
822                 return std::make_pair(0,
823                                       LLT::vector(NumElts / NumPieces, EltTy));
824               }
825 
826               // Need to split because of alignment.
827               unsigned Align = Query.MMODescrs[0].AlignInBits;
828               unsigned EltSize = EltTy.getSizeInBits();
829               if (EltSize > Align &&
830                   (EltSize / Align < DstTy.getNumElements())) {
831                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
832               }
833 
834               // May need relegalization for the scalars.
835               return std::make_pair(0, EltTy);
836             })
837         .minScalar(0, S32);
838 
839     if (IsStore)
840       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
841 
842     // TODO: Need a bitcast lower option?
843     Actions
844         .legalIf([=](const LegalityQuery &Query) {
845           const LLT Ty0 = Query.Types[0];
846           unsigned Size = Ty0.getSizeInBits();
847           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
848           unsigned Align = Query.MMODescrs[0].AlignInBits;
849 
850           // FIXME: Widening store from alignment not valid.
851           if (MemSize < Size)
852             MemSize = std::max(MemSize, Align);
853 
854           // No extending vector loads.
855           if (Size > MemSize && Ty0.isVector())
856             return false;
857 
858           switch (MemSize) {
859           case 8:
860           case 16:
861             return Size == 32;
862           case 32:
863           case 64:
864           case 128:
865             return true;
866           case 96:
867             return ST.hasDwordx3LoadStores();
868           case 256:
869           case 512:
870             return true;
871           default:
872             return false;
873           }
874         })
875         .widenScalarToNextPow2(0)
876         // TODO: v3s32->v4s32 with alignment
877         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
878   }
879 
880   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
881                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
882                                                   {S32, GlobalPtr, 16, 2 * 8},
883                                                   {S32, LocalPtr, 8, 8},
884                                                   {S32, LocalPtr, 16, 16},
885                                                   {S32, PrivatePtr, 8, 8},
886                                                   {S32, PrivatePtr, 16, 16},
887                                                   {S32, ConstantPtr, 8, 8},
888                                                   {S32, ConstantPtr, 16, 2 * 8}});
889   if (ST.hasFlatAddressSpace()) {
890     ExtLoads.legalForTypesWithMemDesc(
891         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
892   }
893 
894   ExtLoads.clampScalar(0, S32, S32)
895           .widenScalarToNextPow2(0)
896           .unsupportedIfMemSizeNotPow2()
897           .lower();
898 
899   auto &Atomics = getActionDefinitionsBuilder(
900     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
901      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
902      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
903      G_ATOMICRMW_UMIN})
904     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
905                {S64, GlobalPtr}, {S64, LocalPtr}});
906   if (ST.hasFlatAddressSpace()) {
907     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
908   }
909 
910   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
911     .legalFor({{S32, LocalPtr}});
912 
913   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
914   // demarshalling
915   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
916     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
917                 {S32, FlatPtr}, {S64, FlatPtr}})
918     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
919                {S32, RegionPtr}, {S64, RegionPtr}});
920   // TODO: Pointer types, any 32-bit or 64-bit vector
921 
922   // Condition should be s32 for scalar, s1 for vector.
923   getActionDefinitionsBuilder(G_SELECT)
924     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
925           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
926           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
927     .clampScalar(0, S16, S64)
928     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
929     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
930     .scalarize(1)
931     .clampMaxNumElements(0, S32, 2)
932     .clampMaxNumElements(0, LocalPtr, 2)
933     .clampMaxNumElements(0, PrivatePtr, 2)
934     .scalarize(0)
935     .widenScalarToNextPow2(0)
936     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
937 
938   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
939   // be more flexible with the shift amount type.
940   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
941     .legalFor({{S32, S32}, {S64, S32}});
942   if (ST.has16BitInsts()) {
943     if (ST.hasVOP3PInsts()) {
944       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
945             .clampMaxNumElements(0, S16, 2);
946     } else
947       Shifts.legalFor({{S16, S32}, {S16, S16}});
948 
949     // TODO: Support 16-bit shift amounts
950     Shifts.clampScalar(1, S32, S32);
951     Shifts.clampScalar(0, S16, S64);
952     Shifts.widenScalarToNextPow2(0, 16);
953   } else {
954     // Make sure we legalize the shift amount type first, as the general
955     // expansion for the shifted type will produce much worse code if it hasn't
956     // been truncated already.
957     Shifts.clampScalar(1, S32, S32);
958     Shifts.clampScalar(0, S32, S64);
959     Shifts.widenScalarToNextPow2(0, 32);
960   }
961   Shifts.scalarize(0);
962 
963   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
964     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
965     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
966     unsigned IdxTypeIdx = 2;
967 
968     getActionDefinitionsBuilder(Op)
969       .customIf([=](const LegalityQuery &Query) {
970           const LLT EltTy = Query.Types[EltTypeIdx];
971           const LLT VecTy = Query.Types[VecTypeIdx];
972           const LLT IdxTy = Query.Types[IdxTypeIdx];
973           return (EltTy.getSizeInBits() == 16 ||
974                   EltTy.getSizeInBits() % 32 == 0) &&
975                  VecTy.getSizeInBits() % 32 == 0 &&
976                  VecTy.getSizeInBits() <= 1024 &&
977                  IdxTy.getSizeInBits() == 32;
978         })
979       .clampScalar(EltTypeIdx, S32, S64)
980       .clampScalar(VecTypeIdx, S32, S64)
981       .clampScalar(IdxTypeIdx, S32, S32);
982   }
983 
984   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
985     .unsupportedIf([=](const LegalityQuery &Query) {
986         const LLT &EltTy = Query.Types[1].getElementType();
987         return Query.Types[0] != EltTy;
988       });
989 
990   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
991     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
992     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
993 
994     // FIXME: Doesn't handle extract of illegal sizes.
995     getActionDefinitionsBuilder(Op)
996       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
997       // FIXME: Multiples of 16 should not be legal.
998       .legalIf([=](const LegalityQuery &Query) {
999           const LLT BigTy = Query.Types[BigTyIdx];
1000           const LLT LitTy = Query.Types[LitTyIdx];
1001           return (BigTy.getSizeInBits() % 32 == 0) &&
1002                  (LitTy.getSizeInBits() % 16 == 0);
1003         })
1004       .widenScalarIf(
1005         [=](const LegalityQuery &Query) {
1006           const LLT BigTy = Query.Types[BigTyIdx];
1007           return (BigTy.getScalarSizeInBits() < 16);
1008         },
1009         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1010       .widenScalarIf(
1011         [=](const LegalityQuery &Query) {
1012           const LLT LitTy = Query.Types[LitTyIdx];
1013           return (LitTy.getScalarSizeInBits() < 16);
1014         },
1015         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1016       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1017       .widenScalarToNextPow2(BigTyIdx, 32);
1018 
1019   }
1020 
1021   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1022     .legalForCartesianProduct(AllS32Vectors, {S32})
1023     .legalForCartesianProduct(AllS64Vectors, {S64})
1024     .clampNumElements(0, V16S32, V32S32)
1025     .clampNumElements(0, V2S64, V16S64)
1026     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1027 
1028   if (ST.hasScalarPackInsts()) {
1029     BuildVector
1030       // FIXME: Should probably widen s1 vectors straight to s32
1031       .minScalarOrElt(0, S16)
1032       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1033       .minScalar(1, S32);
1034 
1035     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1036       .legalFor({V2S16, S32})
1037       .lower();
1038     BuildVector.minScalarOrElt(0, S32);
1039   } else {
1040     BuildVector.customFor({V2S16, S16});
1041     BuildVector.minScalarOrElt(0, S32);
1042 
1043     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1044       .customFor({V2S16, S32})
1045       .lower();
1046   }
1047 
1048   BuildVector.legalIf(isRegisterType(0));
1049 
1050   // FIXME: Clamp maximum size
1051   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1052     .legalIf(isRegisterType(0));
1053 
1054   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1055   // pre-legalize.
1056   if (ST.hasVOP3PInsts()) {
1057     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1058       .customFor({V2S16, V2S16})
1059       .lower();
1060   } else
1061     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1062 
1063   // Merge/Unmerge
1064   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1065     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1066     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1067 
1068     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1069       const LLT &Ty = Query.Types[TypeIdx];
1070       if (Ty.isVector()) {
1071         const LLT &EltTy = Ty.getElementType();
1072         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1073           return true;
1074         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1075           return true;
1076       }
1077       return false;
1078     };
1079 
1080     auto &Builder = getActionDefinitionsBuilder(Op)
1081       // Try to widen to s16 first for small types.
1082       // TODO: Only do this on targets with legal s16 shifts
1083       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1084 
1085       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1086       .lowerFor({{S16, V2S16}})
1087       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1088       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1089                            elementTypeIs(1, S16)),
1090                        changeTo(1, V2S16))
1091       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1092       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1093       // valid.
1094       .clampScalar(LitTyIdx, S32, S256)
1095       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1096       // Break up vectors with weird elements into scalars
1097       .fewerElementsIf(
1098         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1099         scalarize(0))
1100       .fewerElementsIf(
1101         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1102         scalarize(1))
1103       .clampScalar(BigTyIdx, S32, S1024);
1104 
1105     if (Op == G_MERGE_VALUES) {
1106       Builder.widenScalarIf(
1107         // TODO: Use 16-bit shifts if legal for 8-bit values?
1108         [=](const LegalityQuery &Query) {
1109           const LLT Ty = Query.Types[LitTyIdx];
1110           return Ty.getSizeInBits() < 32;
1111         },
1112         changeTo(LitTyIdx, S32));
1113     }
1114 
1115     Builder.widenScalarIf(
1116       [=](const LegalityQuery &Query) {
1117         const LLT Ty = Query.Types[BigTyIdx];
1118         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1119           Ty.getSizeInBits() % 16 != 0;
1120       },
1121       [=](const LegalityQuery &Query) {
1122         // Pick the next power of 2, or a multiple of 64 over 128.
1123         // Whichever is smaller.
1124         const LLT &Ty = Query.Types[BigTyIdx];
1125         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1126         if (NewSizeInBits >= 256) {
1127           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1128           if (RoundedTo < NewSizeInBits)
1129             NewSizeInBits = RoundedTo;
1130         }
1131         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1132       })
1133       .legalIf([=](const LegalityQuery &Query) {
1134           const LLT &BigTy = Query.Types[BigTyIdx];
1135           const LLT &LitTy = Query.Types[LitTyIdx];
1136 
1137           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1138             return false;
1139           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1140             return false;
1141 
1142           return BigTy.getSizeInBits() % 16 == 0 &&
1143                  LitTy.getSizeInBits() % 16 == 0 &&
1144                  BigTy.getSizeInBits() <= 1024;
1145         })
1146       // Any vectors left are the wrong size. Scalarize them.
1147       .scalarize(0)
1148       .scalarize(1);
1149   }
1150 
1151   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1152   // RegBankSelect.
1153   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1154     .legalFor({{S32}, {S64}});
1155 
1156   if (ST.hasVOP3PInsts()) {
1157     SextInReg.lowerFor({{V2S16}})
1158       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1159       // get more vector shift opportunities, since we'll get those when
1160       // expanded.
1161       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1162   } else if (ST.has16BitInsts()) {
1163     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1164   } else {
1165     // Prefer to promote to s32 before lowering if we don't have 16-bit
1166     // shifts. This avoid a lot of intermediate truncate and extend operations.
1167     SextInReg.lowerFor({{S32}, {S64}});
1168   }
1169 
1170   SextInReg
1171     .scalarize(0)
1172     .clampScalar(0, S32, S64)
1173     .lower();
1174 
1175   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1176     .legalFor({S64});
1177 
1178   getActionDefinitionsBuilder({
1179       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1180       G_FCOPYSIGN,
1181 
1182       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1183       G_READ_REGISTER,
1184       G_WRITE_REGISTER,
1185 
1186       G_SADDO, G_SSUBO,
1187 
1188        // TODO: Implement
1189       G_FMINIMUM, G_FMAXIMUM
1190     }).lower();
1191 
1192   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1193         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1194         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1195     .unsupported();
1196 
1197   computeTables();
1198   verify(*ST.getInstrInfo());
1199 }
1200 
1201 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1202                                          MachineRegisterInfo &MRI,
1203                                          MachineIRBuilder &B,
1204                                          GISelChangeObserver &Observer) const {
1205   switch (MI.getOpcode()) {
1206   case TargetOpcode::G_ADDRSPACE_CAST:
1207     return legalizeAddrSpaceCast(MI, MRI, B);
1208   case TargetOpcode::G_FRINT:
1209     return legalizeFrint(MI, MRI, B);
1210   case TargetOpcode::G_FCEIL:
1211     return legalizeFceil(MI, MRI, B);
1212   case TargetOpcode::G_INTRINSIC_TRUNC:
1213     return legalizeIntrinsicTrunc(MI, MRI, B);
1214   case TargetOpcode::G_SITOFP:
1215     return legalizeITOFP(MI, MRI, B, true);
1216   case TargetOpcode::G_UITOFP:
1217     return legalizeITOFP(MI, MRI, B, false);
1218   case TargetOpcode::G_FPTOSI:
1219     return legalizeFPTOI(MI, MRI, B, true);
1220   case TargetOpcode::G_FPTOUI:
1221     return legalizeFPTOI(MI, MRI, B, false);
1222   case TargetOpcode::G_FMINNUM:
1223   case TargetOpcode::G_FMAXNUM:
1224   case TargetOpcode::G_FMINNUM_IEEE:
1225   case TargetOpcode::G_FMAXNUM_IEEE:
1226     return legalizeMinNumMaxNum(MI, MRI, B);
1227   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1228     return legalizeExtractVectorElt(MI, MRI, B);
1229   case TargetOpcode::G_INSERT_VECTOR_ELT:
1230     return legalizeInsertVectorElt(MI, MRI, B);
1231   case TargetOpcode::G_SHUFFLE_VECTOR:
1232     return legalizeShuffleVector(MI, MRI, B);
1233   case TargetOpcode::G_FSIN:
1234   case TargetOpcode::G_FCOS:
1235     return legalizeSinCos(MI, MRI, B);
1236   case TargetOpcode::G_GLOBAL_VALUE:
1237     return legalizeGlobalValue(MI, MRI, B);
1238   case TargetOpcode::G_LOAD:
1239     return legalizeLoad(MI, MRI, B, Observer);
1240   case TargetOpcode::G_FMAD:
1241     return legalizeFMad(MI, MRI, B);
1242   case TargetOpcode::G_FDIV:
1243     return legalizeFDIV(MI, MRI, B);
1244   case TargetOpcode::G_ATOMIC_CMPXCHG:
1245     return legalizeAtomicCmpXChg(MI, MRI, B);
1246   case TargetOpcode::G_FLOG:
1247     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1248   case TargetOpcode::G_FLOG10:
1249     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1250   case TargetOpcode::G_FEXP:
1251     return legalizeFExp(MI, B);
1252   case TargetOpcode::G_BUILD_VECTOR:
1253     return legalizeBuildVector(MI, MRI, B);
1254   default:
1255     return false;
1256   }
1257 
1258   llvm_unreachable("expected switch to return");
1259 }
1260 
1261 Register AMDGPULegalizerInfo::getSegmentAperture(
1262   unsigned AS,
1263   MachineRegisterInfo &MRI,
1264   MachineIRBuilder &B) const {
1265   MachineFunction &MF = B.getMF();
1266   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1267   const LLT S32 = LLT::scalar(32);
1268 
1269   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1270 
1271   if (ST.hasApertureRegs()) {
1272     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1273     // getreg.
1274     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1275         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1276         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1277     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1278         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1279         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1280     unsigned Encoding =
1281         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1282         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1283         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1284 
1285     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1286 
1287     B.buildInstr(AMDGPU::S_GETREG_B32)
1288       .addDef(GetReg)
1289       .addImm(Encoding);
1290     MRI.setType(GetReg, S32);
1291 
1292     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1293     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1294   }
1295 
1296   Register QueuePtr = MRI.createGenericVirtualRegister(
1297     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1298 
1299   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1300   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1301     return Register();
1302 
1303   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1304   // private_segment_aperture_base_hi.
1305   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1306 
1307   // TODO: can we be smarter about machine pointer info?
1308   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1309   MachineMemOperand *MMO = MF.getMachineMemOperand(
1310     PtrInfo,
1311     MachineMemOperand::MOLoad |
1312     MachineMemOperand::MODereferenceable |
1313     MachineMemOperand::MOInvariant,
1314     4,
1315     MinAlign(64, StructOffset));
1316 
1317   Register LoadAddr;
1318 
1319   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1320   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1321 }
1322 
1323 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1324   MachineInstr &MI, MachineRegisterInfo &MRI,
1325   MachineIRBuilder &B) const {
1326   MachineFunction &MF = B.getMF();
1327 
1328   B.setInstr(MI);
1329 
1330   const LLT S32 = LLT::scalar(32);
1331   Register Dst = MI.getOperand(0).getReg();
1332   Register Src = MI.getOperand(1).getReg();
1333 
1334   LLT DstTy = MRI.getType(Dst);
1335   LLT SrcTy = MRI.getType(Src);
1336   unsigned DestAS = DstTy.getAddressSpace();
1337   unsigned SrcAS = SrcTy.getAddressSpace();
1338 
1339   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1340   // vector element.
1341   assert(!DstTy.isVector());
1342 
1343   const AMDGPUTargetMachine &TM
1344     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1345 
1346   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1347   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1348     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1349     return true;
1350   }
1351 
1352   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1353     // Truncate.
1354     B.buildExtract(Dst, Src, 0);
1355     MI.eraseFromParent();
1356     return true;
1357   }
1358 
1359   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1360     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1361     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1362 
1363     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1364     // another. Merge operands are required to be the same type, but creating an
1365     // extra ptrtoint would be kind of pointless.
1366     auto HighAddr = B.buildConstant(
1367       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1368     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1369     MI.eraseFromParent();
1370     return true;
1371   }
1372 
1373   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1374     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1375            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1376     unsigned NullVal = TM.getNullPointerValue(DestAS);
1377 
1378     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1379     auto FlatNull = B.buildConstant(SrcTy, 0);
1380 
1381     // Extract low 32-bits of the pointer.
1382     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1383 
1384     auto CmpRes =
1385         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1386     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1387 
1388     MI.eraseFromParent();
1389     return true;
1390   }
1391 
1392   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1393     return false;
1394 
1395   if (!ST.hasFlatAddressSpace())
1396     return false;
1397 
1398   auto SegmentNull =
1399       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1400   auto FlatNull =
1401       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1402 
1403   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1404   if (!ApertureReg.isValid())
1405     return false;
1406 
1407   auto CmpRes =
1408       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1409 
1410   // Coerce the type of the low half of the result so we can use merge_values.
1411   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1412 
1413   // TODO: Should we allow mismatched types but matching sizes in merges to
1414   // avoid the ptrtoint?
1415   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1416   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1417 
1418   MI.eraseFromParent();
1419   return true;
1420 }
1421 
1422 bool AMDGPULegalizerInfo::legalizeFrint(
1423   MachineInstr &MI, MachineRegisterInfo &MRI,
1424   MachineIRBuilder &B) const {
1425   B.setInstr(MI);
1426 
1427   Register Src = MI.getOperand(1).getReg();
1428   LLT Ty = MRI.getType(Src);
1429   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1430 
1431   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1432   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1433 
1434   auto C1 = B.buildFConstant(Ty, C1Val);
1435   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1436 
1437   // TODO: Should this propagate fast-math-flags?
1438   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1439   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1440 
1441   auto C2 = B.buildFConstant(Ty, C2Val);
1442   auto Fabs = B.buildFAbs(Ty, Src);
1443 
1444   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1445   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1446   return true;
1447 }
1448 
1449 bool AMDGPULegalizerInfo::legalizeFceil(
1450   MachineInstr &MI, MachineRegisterInfo &MRI,
1451   MachineIRBuilder &B) const {
1452   B.setInstr(MI);
1453 
1454   const LLT S1 = LLT::scalar(1);
1455   const LLT S64 = LLT::scalar(64);
1456 
1457   Register Src = MI.getOperand(1).getReg();
1458   assert(MRI.getType(Src) == S64);
1459 
1460   // result = trunc(src)
1461   // if (src > 0.0 && src != result)
1462   //   result += 1.0
1463 
1464   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1465 
1466   const auto Zero = B.buildFConstant(S64, 0.0);
1467   const auto One = B.buildFConstant(S64, 1.0);
1468   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1469   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1470   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1471   auto Add = B.buildSelect(S64, And, One, Zero);
1472 
1473   // TODO: Should this propagate fast-math-flags?
1474   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1475   return true;
1476 }
1477 
1478 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1479                                               MachineIRBuilder &B) {
1480   const unsigned FractBits = 52;
1481   const unsigned ExpBits = 11;
1482   LLT S32 = LLT::scalar(32);
1483 
1484   auto Const0 = B.buildConstant(S32, FractBits - 32);
1485   auto Const1 = B.buildConstant(S32, ExpBits);
1486 
1487   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1488     .addUse(Const0.getReg(0))
1489     .addUse(Const1.getReg(0));
1490 
1491   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1492 }
1493 
1494 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1495   MachineInstr &MI, MachineRegisterInfo &MRI,
1496   MachineIRBuilder &B) const {
1497   B.setInstr(MI);
1498 
1499   const LLT S1 = LLT::scalar(1);
1500   const LLT S32 = LLT::scalar(32);
1501   const LLT S64 = LLT::scalar(64);
1502 
1503   Register Src = MI.getOperand(1).getReg();
1504   assert(MRI.getType(Src) == S64);
1505 
1506   // TODO: Should this use extract since the low half is unused?
1507   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1508   Register Hi = Unmerge.getReg(1);
1509 
1510   // Extract the upper half, since this is where we will find the sign and
1511   // exponent.
1512   auto Exp = extractF64Exponent(Hi, B);
1513 
1514   const unsigned FractBits = 52;
1515 
1516   // Extract the sign bit.
1517   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1518   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1519 
1520   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1521 
1522   const auto Zero32 = B.buildConstant(S32, 0);
1523 
1524   // Extend back to 64-bits.
1525   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1526 
1527   auto Shr = B.buildAShr(S64, FractMask, Exp);
1528   auto Not = B.buildNot(S64, Shr);
1529   auto Tmp0 = B.buildAnd(S64, Src, Not);
1530   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1531 
1532   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1533   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1534 
1535   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1536   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1537   return true;
1538 }
1539 
1540 bool AMDGPULegalizerInfo::legalizeITOFP(
1541   MachineInstr &MI, MachineRegisterInfo &MRI,
1542   MachineIRBuilder &B, bool Signed) const {
1543   B.setInstr(MI);
1544 
1545   Register Dst = MI.getOperand(0).getReg();
1546   Register Src = MI.getOperand(1).getReg();
1547 
1548   const LLT S64 = LLT::scalar(64);
1549   const LLT S32 = LLT::scalar(32);
1550 
1551   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1552 
1553   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1554 
1555   auto CvtHi = Signed ?
1556     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1557     B.buildUITOFP(S64, Unmerge.getReg(1));
1558 
1559   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1560 
1561   auto ThirtyTwo = B.buildConstant(S32, 32);
1562   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1563     .addUse(CvtHi.getReg(0))
1564     .addUse(ThirtyTwo.getReg(0));
1565 
1566   // TODO: Should this propagate fast-math-flags?
1567   B.buildFAdd(Dst, LdExp, CvtLo);
1568   MI.eraseFromParent();
1569   return true;
1570 }
1571 
1572 // TODO: Copied from DAG implementation. Verify logic and document how this
1573 // actually works.
1574 bool AMDGPULegalizerInfo::legalizeFPTOI(
1575   MachineInstr &MI, MachineRegisterInfo &MRI,
1576   MachineIRBuilder &B, bool Signed) const {
1577   B.setInstr(MI);
1578 
1579   Register Dst = MI.getOperand(0).getReg();
1580   Register Src = MI.getOperand(1).getReg();
1581 
1582   const LLT S64 = LLT::scalar(64);
1583   const LLT S32 = LLT::scalar(32);
1584 
1585   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1586 
1587   unsigned Flags = MI.getFlags();
1588 
1589   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1590   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1591   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1592 
1593   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1594   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1595   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1596 
1597   auto Hi = Signed ?
1598     B.buildFPTOSI(S32, FloorMul) :
1599     B.buildFPTOUI(S32, FloorMul);
1600   auto Lo = B.buildFPTOUI(S32, Fma);
1601 
1602   B.buildMerge(Dst, { Lo.getReg(0), Hi.getReg(0) });
1603   MI.eraseFromParent();
1604 
1605   return true;
1606 }
1607 
1608 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1609   MachineInstr &MI, MachineRegisterInfo &MRI,
1610   MachineIRBuilder &B) const {
1611   MachineFunction &MF = B.getMF();
1612   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1613 
1614   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1615                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1616 
1617   // With ieee_mode disabled, the instructions have the correct behavior
1618   // already for G_FMINNUM/G_FMAXNUM
1619   if (!MFI->getMode().IEEE)
1620     return !IsIEEEOp;
1621 
1622   if (IsIEEEOp)
1623     return true;
1624 
1625   MachineIRBuilder HelperBuilder(MI);
1626   GISelObserverWrapper DummyObserver;
1627   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1628   HelperBuilder.setInstr(MI);
1629   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1630 }
1631 
1632 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1633   MachineInstr &MI, MachineRegisterInfo &MRI,
1634   MachineIRBuilder &B) const {
1635   // TODO: Should move some of this into LegalizerHelper.
1636 
1637   // TODO: Promote dynamic indexing of s16 to s32
1638   // TODO: Dynamic s64 indexing is only legal for SGPR.
1639   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1640   if (!IdxVal) // Dynamic case will be selected to register indexing.
1641     return true;
1642 
1643   Register Dst = MI.getOperand(0).getReg();
1644   Register Vec = MI.getOperand(1).getReg();
1645 
1646   LLT VecTy = MRI.getType(Vec);
1647   LLT EltTy = VecTy.getElementType();
1648   assert(EltTy == MRI.getType(Dst));
1649 
1650   B.setInstr(MI);
1651 
1652   if (IdxVal.getValue() < VecTy.getNumElements())
1653     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1654   else
1655     B.buildUndef(Dst);
1656 
1657   MI.eraseFromParent();
1658   return true;
1659 }
1660 
1661 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1662   MachineInstr &MI, MachineRegisterInfo &MRI,
1663   MachineIRBuilder &B) const {
1664   // TODO: Should move some of this into LegalizerHelper.
1665 
1666   // TODO: Promote dynamic indexing of s16 to s32
1667   // TODO: Dynamic s64 indexing is only legal for SGPR.
1668   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1669   if (!IdxVal) // Dynamic case will be selected to register indexing.
1670     return true;
1671 
1672   Register Dst = MI.getOperand(0).getReg();
1673   Register Vec = MI.getOperand(1).getReg();
1674   Register Ins = MI.getOperand(2).getReg();
1675 
1676   LLT VecTy = MRI.getType(Vec);
1677   LLT EltTy = VecTy.getElementType();
1678   assert(EltTy == MRI.getType(Ins));
1679 
1680   B.setInstr(MI);
1681 
1682   if (IdxVal.getValue() < VecTy.getNumElements())
1683     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1684   else
1685     B.buildUndef(Dst);
1686 
1687   MI.eraseFromParent();
1688   return true;
1689 }
1690 
1691 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1692   assert(Mask.size() == 2);
1693 
1694   // If one half is undef, the other is trivially in the same reg.
1695   if (Mask[0] == -1 || Mask[1] == -1)
1696     return true;
1697   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1698          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1699 }
1700 
1701 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1702   MachineInstr &MI, MachineRegisterInfo &MRI,
1703   MachineIRBuilder &B) const {
1704   const LLT V2S16 = LLT::vector(2, 16);
1705 
1706   Register Dst = MI.getOperand(0).getReg();
1707   Register Src0 = MI.getOperand(1).getReg();
1708   LLT DstTy = MRI.getType(Dst);
1709   LLT SrcTy = MRI.getType(Src0);
1710 
1711   if (SrcTy == V2S16 && DstTy == V2S16 &&
1712       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1713     return true;
1714 
1715   MachineIRBuilder HelperBuilder(MI);
1716   GISelObserverWrapper DummyObserver;
1717   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1718   HelperBuilder.setInstr(MI);
1719   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1720 }
1721 
1722 bool AMDGPULegalizerInfo::legalizeSinCos(
1723   MachineInstr &MI, MachineRegisterInfo &MRI,
1724   MachineIRBuilder &B) const {
1725   B.setInstr(MI);
1726 
1727   Register DstReg = MI.getOperand(0).getReg();
1728   Register SrcReg = MI.getOperand(1).getReg();
1729   LLT Ty = MRI.getType(DstReg);
1730   unsigned Flags = MI.getFlags();
1731 
1732   Register TrigVal;
1733   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1734   if (ST.hasTrigReducedRange()) {
1735     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1736     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1737       .addUse(MulVal.getReg(0))
1738       .setMIFlags(Flags).getReg(0);
1739   } else
1740     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1741 
1742   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1743     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1744   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1745     .addUse(TrigVal)
1746     .setMIFlags(Flags);
1747   MI.eraseFromParent();
1748   return true;
1749 }
1750 
1751 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1752   Register DstReg, LLT PtrTy,
1753   MachineIRBuilder &B, const GlobalValue *GV,
1754   unsigned Offset, unsigned GAFlags) const {
1755   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1756   // to the following code sequence:
1757   //
1758   // For constant address space:
1759   //   s_getpc_b64 s[0:1]
1760   //   s_add_u32 s0, s0, $symbol
1761   //   s_addc_u32 s1, s1, 0
1762   //
1763   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1764   //   a fixup or relocation is emitted to replace $symbol with a literal
1765   //   constant, which is a pc-relative offset from the encoding of the $symbol
1766   //   operand to the global variable.
1767   //
1768   // For global address space:
1769   //   s_getpc_b64 s[0:1]
1770   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1771   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1772   //
1773   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1774   //   fixups or relocations are emitted to replace $symbol@*@lo and
1775   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1776   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1777   //   operand to the global variable.
1778   //
1779   // What we want here is an offset from the value returned by s_getpc
1780   // (which is the address of the s_add_u32 instruction) to the global
1781   // variable, but since the encoding of $symbol starts 4 bytes after the start
1782   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1783   // small. This requires us to add 4 to the global variable offset in order to
1784   // compute the correct address.
1785 
1786   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1787 
1788   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1789     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1790 
1791   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1792     .addDef(PCReg);
1793 
1794   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1795   if (GAFlags == SIInstrInfo::MO_NONE)
1796     MIB.addImm(0);
1797   else
1798     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1799 
1800   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1801 
1802   if (PtrTy.getSizeInBits() == 32)
1803     B.buildExtract(DstReg, PCReg, 0);
1804   return true;
1805  }
1806 
1807 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1808   MachineInstr &MI, MachineRegisterInfo &MRI,
1809   MachineIRBuilder &B) const {
1810   Register DstReg = MI.getOperand(0).getReg();
1811   LLT Ty = MRI.getType(DstReg);
1812   unsigned AS = Ty.getAddressSpace();
1813 
1814   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1815   MachineFunction &MF = B.getMF();
1816   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1817   B.setInstr(MI);
1818 
1819   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1820     if (!MFI->isEntryFunction()) {
1821       const Function &Fn = MF.getFunction();
1822       DiagnosticInfoUnsupported BadLDSDecl(
1823         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1824       Fn.getContext().diagnose(BadLDSDecl);
1825     }
1826 
1827     // TODO: We could emit code to handle the initialization somewhere.
1828     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1829       const SITargetLowering *TLI = ST.getTargetLowering();
1830       if (!TLI->shouldUseLDSConstAddress(GV)) {
1831         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1832         return true; // Leave in place;
1833       }
1834 
1835       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1836       MI.eraseFromParent();
1837       return true;
1838     }
1839 
1840     const Function &Fn = MF.getFunction();
1841     DiagnosticInfoUnsupported BadInit(
1842       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1843     Fn.getContext().diagnose(BadInit);
1844     return true;
1845   }
1846 
1847   const SITargetLowering *TLI = ST.getTargetLowering();
1848 
1849   if (TLI->shouldEmitFixup(GV)) {
1850     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1851     MI.eraseFromParent();
1852     return true;
1853   }
1854 
1855   if (TLI->shouldEmitPCReloc(GV)) {
1856     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1857     MI.eraseFromParent();
1858     return true;
1859   }
1860 
1861   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1862   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1863 
1864   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1865     MachinePointerInfo::getGOT(MF),
1866     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1867     MachineMemOperand::MOInvariant,
1868     8 /*Size*/, 8 /*Align*/);
1869 
1870   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1871 
1872   if (Ty.getSizeInBits() == 32) {
1873     // Truncate if this is a 32-bit constant adrdess.
1874     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1875     B.buildExtract(DstReg, Load, 0);
1876   } else
1877     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1878 
1879   MI.eraseFromParent();
1880   return true;
1881 }
1882 
1883 bool AMDGPULegalizerInfo::legalizeLoad(
1884   MachineInstr &MI, MachineRegisterInfo &MRI,
1885   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1886   B.setInstr(MI);
1887   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1888   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1889   Observer.changingInstr(MI);
1890   MI.getOperand(1).setReg(Cast.getReg(0));
1891   Observer.changedInstr(MI);
1892   return true;
1893 }
1894 
1895 bool AMDGPULegalizerInfo::legalizeFMad(
1896   MachineInstr &MI, MachineRegisterInfo &MRI,
1897   MachineIRBuilder &B) const {
1898   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1899   assert(Ty.isScalar());
1900 
1901   MachineFunction &MF = B.getMF();
1902   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1903 
1904   // TODO: Always legal with future ftz flag.
1905   // FIXME: Do we need just output?
1906   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
1907     return true;
1908   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
1909     return true;
1910 
1911   MachineIRBuilder HelperBuilder(MI);
1912   GISelObserverWrapper DummyObserver;
1913   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1914   HelperBuilder.setMBB(*MI.getParent());
1915   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1916 }
1917 
1918 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1919   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1920   Register DstReg = MI.getOperand(0).getReg();
1921   Register PtrReg = MI.getOperand(1).getReg();
1922   Register CmpVal = MI.getOperand(2).getReg();
1923   Register NewVal = MI.getOperand(3).getReg();
1924 
1925   assert(SITargetLowering::isFlatGlobalAddrSpace(
1926            MRI.getType(PtrReg).getAddressSpace()) &&
1927          "this should not have been custom lowered");
1928 
1929   LLT ValTy = MRI.getType(CmpVal);
1930   LLT VecTy = LLT::vector(2, ValTy);
1931 
1932   B.setInstr(MI);
1933   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1934 
1935   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1936     .addDef(DstReg)
1937     .addUse(PtrReg)
1938     .addUse(PackedVal)
1939     .setMemRefs(MI.memoperands());
1940 
1941   MI.eraseFromParent();
1942   return true;
1943 }
1944 
1945 bool AMDGPULegalizerInfo::legalizeFlog(
1946   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
1947   Register Dst = MI.getOperand(0).getReg();
1948   Register Src = MI.getOperand(1).getReg();
1949   LLT Ty = B.getMRI()->getType(Dst);
1950   unsigned Flags = MI.getFlags();
1951   B.setInstr(MI);
1952 
1953   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
1954   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
1955 
1956   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
1957   MI.eraseFromParent();
1958   return true;
1959 }
1960 
1961 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
1962                                        MachineIRBuilder &B) const {
1963   Register Dst = MI.getOperand(0).getReg();
1964   Register Src = MI.getOperand(1).getReg();
1965   unsigned Flags = MI.getFlags();
1966   LLT Ty = B.getMRI()->getType(Dst);
1967   B.setInstr(MI);
1968 
1969   auto K = B.buildFConstant(Ty, numbers::log2e);
1970   auto Mul = B.buildFMul(Ty, Src, K, Flags);
1971   B.buildFExp2(Dst, Mul, Flags);
1972   MI.eraseFromParent();
1973   return true;
1974 }
1975 
1976 // Turn an illegal packed v2s16 build vector into bit operations.
1977 // TODO: This should probably be a bitcast action in LegalizerHelper.
1978 bool AMDGPULegalizerInfo::legalizeBuildVector(
1979   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1980   Register Dst = MI.getOperand(0).getReg();
1981   LLT DstTy = MRI.getType(Dst);
1982   const LLT S32 = LLT::scalar(32);
1983   const LLT V2S16 = LLT::vector(2, 16);
1984   (void)DstTy;
1985   (void)V2S16;
1986   assert(DstTy == V2S16);
1987 
1988   Register Src0 = MI.getOperand(1).getReg();
1989   Register Src1 = MI.getOperand(2).getReg();
1990   assert(MRI.getType(Src0) == LLT::scalar(16));
1991 
1992   B.setInstr(MI);
1993   auto Merge = B.buildMerge(S32, {Src0, Src1});
1994   B.buildBitcast(Dst, Merge);
1995 
1996   MI.eraseFromParent();
1997   return true;
1998 }
1999 
2000 // Return the use branch instruction, otherwise null if the usage is invalid.
2001 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2002                                        MachineRegisterInfo &MRI,
2003                                        MachineInstr *&Br) {
2004   Register CondDef = MI.getOperand(0).getReg();
2005   if (!MRI.hasOneNonDBGUse(CondDef))
2006     return nullptr;
2007 
2008   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2009   if (UseMI.getParent() != MI.getParent() ||
2010       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2011     return nullptr;
2012 
2013   // Make sure the cond br is followed by a G_BR
2014   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2015   if (Next != MI.getParent()->end()) {
2016     if (Next->getOpcode() != AMDGPU::G_BR)
2017       return nullptr;
2018     Br = &*Next;
2019   }
2020 
2021   return &UseMI;
2022 }
2023 
2024 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2025                                                 Register Reg, LLT Ty) const {
2026   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2027   if (LiveIn)
2028     return LiveIn;
2029 
2030   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2031   MRI.addLiveIn(Reg, NewReg);
2032   return NewReg;
2033 }
2034 
2035 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2036                                          const ArgDescriptor *Arg) const {
2037   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2038     return false; // TODO: Handle these
2039 
2040   assert(Arg->getRegister().isPhysical());
2041 
2042   MachineRegisterInfo &MRI = *B.getMRI();
2043 
2044   LLT Ty = MRI.getType(DstReg);
2045   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2046 
2047   if (Arg->isMasked()) {
2048     // TODO: Should we try to emit this once in the entry block?
2049     const LLT S32 = LLT::scalar(32);
2050     const unsigned Mask = Arg->getMask();
2051     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2052 
2053     Register AndMaskSrc = LiveIn;
2054 
2055     if (Shift != 0) {
2056       auto ShiftAmt = B.buildConstant(S32, Shift);
2057       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2058     }
2059 
2060     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2061   } else
2062     B.buildCopy(DstReg, LiveIn);
2063 
2064   // Insert the argument copy if it doens't already exist.
2065   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2066   if (!MRI.getVRegDef(LiveIn)) {
2067     // FIXME: Should have scoped insert pt
2068     MachineBasicBlock &OrigInsBB = B.getMBB();
2069     auto OrigInsPt = B.getInsertPt();
2070 
2071     MachineBasicBlock &EntryMBB = B.getMF().front();
2072     EntryMBB.addLiveIn(Arg->getRegister());
2073     B.setInsertPt(EntryMBB, EntryMBB.begin());
2074     B.buildCopy(LiveIn, Arg->getRegister());
2075 
2076     B.setInsertPt(OrigInsBB, OrigInsPt);
2077   }
2078 
2079   return true;
2080 }
2081 
2082 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2083   MachineInstr &MI,
2084   MachineRegisterInfo &MRI,
2085   MachineIRBuilder &B,
2086   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2087   B.setInstr(MI);
2088 
2089   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2090 
2091   const ArgDescriptor *Arg;
2092   const TargetRegisterClass *RC;
2093   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2094   if (!Arg) {
2095     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2096     return false;
2097   }
2098 
2099   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2100     MI.eraseFromParent();
2101     return true;
2102   }
2103 
2104   return false;
2105 }
2106 
2107 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2108                                        MachineRegisterInfo &MRI,
2109                                        MachineIRBuilder &B) const {
2110   B.setInstr(MI);
2111   Register Dst = MI.getOperand(0).getReg();
2112   LLT DstTy = MRI.getType(Dst);
2113   LLT S16 = LLT::scalar(16);
2114   LLT S32 = LLT::scalar(32);
2115   LLT S64 = LLT::scalar(64);
2116 
2117   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2118     return true;
2119 
2120   if (DstTy == S16)
2121     return legalizeFDIV16(MI, MRI, B);
2122   if (DstTy == S32)
2123     return legalizeFDIV32(MI, MRI, B);
2124   if (DstTy == S64)
2125     return legalizeFDIV64(MI, MRI, B);
2126 
2127   return false;
2128 }
2129 
2130 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2131                                                  MachineRegisterInfo &MRI,
2132                                                  MachineIRBuilder &B) const {
2133   Register Res = MI.getOperand(0).getReg();
2134   Register LHS = MI.getOperand(1).getReg();
2135   Register RHS = MI.getOperand(2).getReg();
2136 
2137   uint16_t Flags = MI.getFlags();
2138 
2139   LLT ResTy = MRI.getType(Res);
2140   LLT S32 = LLT::scalar(32);
2141   LLT S64 = LLT::scalar(64);
2142 
2143   const MachineFunction &MF = B.getMF();
2144   bool Unsafe =
2145     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2146 
2147   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2148     return false;
2149 
2150   if (!Unsafe && ResTy == S32 &&
2151       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2152     return false;
2153 
2154   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2155     // 1 / x -> RCP(x)
2156     if (CLHS->isExactlyValue(1.0)) {
2157       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2158         .addUse(RHS)
2159         .setMIFlags(Flags);
2160 
2161       MI.eraseFromParent();
2162       return true;
2163     }
2164 
2165     // -1 / x -> RCP( FNEG(x) )
2166     if (CLHS->isExactlyValue(-1.0)) {
2167       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2168       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2169         .addUse(FNeg.getReg(0))
2170         .setMIFlags(Flags);
2171 
2172       MI.eraseFromParent();
2173       return true;
2174     }
2175   }
2176 
2177   // x / y -> x * (1.0 / y)
2178   if (Unsafe) {
2179     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2180       .addUse(RHS)
2181       .setMIFlags(Flags);
2182     B.buildFMul(Res, LHS, RCP, Flags);
2183 
2184     MI.eraseFromParent();
2185     return true;
2186   }
2187 
2188   return false;
2189 }
2190 
2191 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2192                                          MachineRegisterInfo &MRI,
2193                                          MachineIRBuilder &B) const {
2194   B.setInstr(MI);
2195   Register Res = MI.getOperand(0).getReg();
2196   Register LHS = MI.getOperand(1).getReg();
2197   Register RHS = MI.getOperand(2).getReg();
2198 
2199   uint16_t Flags = MI.getFlags();
2200 
2201   LLT S16 = LLT::scalar(16);
2202   LLT S32 = LLT::scalar(32);
2203 
2204   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2205   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2206 
2207   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2208     .addUse(RHSExt.getReg(0))
2209     .setMIFlags(Flags);
2210 
2211   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2212   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2213 
2214   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2215     .addUse(RDst.getReg(0))
2216     .addUse(RHS)
2217     .addUse(LHS)
2218     .setMIFlags(Flags);
2219 
2220   MI.eraseFromParent();
2221   return true;
2222 }
2223 
2224 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2225 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2226 static void toggleSPDenormMode(bool Enable,
2227                                MachineIRBuilder &B,
2228                                const GCNSubtarget &ST,
2229                                AMDGPU::SIModeRegisterDefaults Mode) {
2230   // Set SP denorm mode to this value.
2231   unsigned SPDenormMode =
2232     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2233 
2234   if (ST.hasDenormModeInst()) {
2235     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2236     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2237 
2238     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2239     B.buildInstr(AMDGPU::S_DENORM_MODE)
2240       .addImm(NewDenormModeValue);
2241 
2242   } else {
2243     // Select FP32 bit field in mode register.
2244     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2245                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2246                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2247 
2248     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2249       .addImm(SPDenormMode)
2250       .addImm(SPDenormModeBitField);
2251   }
2252 }
2253 
2254 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2255                                          MachineRegisterInfo &MRI,
2256                                          MachineIRBuilder &B) const {
2257   B.setInstr(MI);
2258   Register Res = MI.getOperand(0).getReg();
2259   Register LHS = MI.getOperand(1).getReg();
2260   Register RHS = MI.getOperand(2).getReg();
2261   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2262   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2263 
2264   uint16_t Flags = MI.getFlags();
2265 
2266   LLT S32 = LLT::scalar(32);
2267   LLT S1 = LLT::scalar(1);
2268 
2269   auto One = B.buildFConstant(S32, 1.0f);
2270 
2271   auto DenominatorScaled =
2272     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2273       .addUse(RHS)
2274       .addUse(LHS)
2275       .addImm(1)
2276       .setMIFlags(Flags);
2277   auto NumeratorScaled =
2278     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2279       .addUse(LHS)
2280       .addUse(RHS)
2281       .addImm(0)
2282       .setMIFlags(Flags);
2283 
2284   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2285     .addUse(DenominatorScaled.getReg(0))
2286     .setMIFlags(Flags);
2287   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2288 
2289   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2290   // aren't modeled as reading it.
2291   if (!Mode.allFP32Denormals())
2292     toggleSPDenormMode(true, B, ST, Mode);
2293 
2294   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2295   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2296   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2297   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2298   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2299   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2300 
2301   if (!Mode.allFP32Denormals())
2302     toggleSPDenormMode(false, B, ST, Mode);
2303 
2304   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2305     .addUse(Fma4.getReg(0))
2306     .addUse(Fma1.getReg(0))
2307     .addUse(Fma3.getReg(0))
2308     .addUse(NumeratorScaled.getReg(1))
2309     .setMIFlags(Flags);
2310 
2311   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2312     .addUse(Fmas.getReg(0))
2313     .addUse(RHS)
2314     .addUse(LHS)
2315     .setMIFlags(Flags);
2316 
2317   MI.eraseFromParent();
2318   return true;
2319 }
2320 
2321 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2322                                          MachineRegisterInfo &MRI,
2323                                          MachineIRBuilder &B) const {
2324   B.setInstr(MI);
2325   Register Res = MI.getOperand(0).getReg();
2326   Register LHS = MI.getOperand(1).getReg();
2327   Register RHS = MI.getOperand(2).getReg();
2328 
2329   uint16_t Flags = MI.getFlags();
2330 
2331   LLT S64 = LLT::scalar(64);
2332   LLT S1 = LLT::scalar(1);
2333 
2334   auto One = B.buildFConstant(S64, 1.0);
2335 
2336   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2337     .addUse(LHS)
2338     .addUse(RHS)
2339     .addImm(1)
2340     .setMIFlags(Flags);
2341 
2342   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2343 
2344   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2345     .addUse(DivScale0.getReg(0))
2346     .setMIFlags(Flags);
2347 
2348   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2349   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2350   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2351 
2352   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2353     .addUse(LHS)
2354     .addUse(RHS)
2355     .addImm(0)
2356     .setMIFlags(Flags);
2357 
2358   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2359   auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2360   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2361 
2362   Register Scale;
2363   if (!ST.hasUsableDivScaleConditionOutput()) {
2364     // Workaround a hardware bug on SI where the condition output from div_scale
2365     // is not usable.
2366 
2367     LLT S32 = LLT::scalar(32);
2368 
2369     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2370     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2371     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2372     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2373 
2374     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2375                               Scale1Unmerge.getReg(1));
2376     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2377                               Scale0Unmerge.getReg(1));
2378     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2379   } else {
2380     Scale = DivScale1.getReg(1);
2381   }
2382 
2383   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2384     .addUse(Fma4.getReg(0))
2385     .addUse(Fma3.getReg(0))
2386     .addUse(Mul.getReg(0))
2387     .addUse(Scale)
2388     .setMIFlags(Flags);
2389 
2390   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2391     .addUse(Fmas.getReg(0))
2392     .addUse(RHS)
2393     .addUse(LHS)
2394     .setMIFlags(Flags);
2395 
2396   MI.eraseFromParent();
2397   return true;
2398 }
2399 
2400 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2401                                                  MachineRegisterInfo &MRI,
2402                                                  MachineIRBuilder &B) const {
2403   B.setInstr(MI);
2404   Register Res = MI.getOperand(0).getReg();
2405   Register LHS = MI.getOperand(2).getReg();
2406   Register RHS = MI.getOperand(3).getReg();
2407   uint16_t Flags = MI.getFlags();
2408 
2409   LLT S32 = LLT::scalar(32);
2410   LLT S1 = LLT::scalar(1);
2411 
2412   auto Abs = B.buildFAbs(S32, RHS, Flags);
2413   const APFloat C0Val(1.0f);
2414 
2415   auto C0 = B.buildConstant(S32, 0x6f800000);
2416   auto C1 = B.buildConstant(S32, 0x2f800000);
2417   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2418 
2419   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2420   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2421 
2422   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2423 
2424   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2425     .addUse(Mul0.getReg(0))
2426     .setMIFlags(Flags);
2427 
2428   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2429 
2430   B.buildFMul(Res, Sel, Mul1, Flags);
2431 
2432   MI.eraseFromParent();
2433   return true;
2434 }
2435 
2436 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2437                                                  MachineRegisterInfo &MRI,
2438                                                  MachineIRBuilder &B) const {
2439   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2440   if (!MFI->isEntryFunction()) {
2441     return legalizePreloadedArgIntrin(MI, MRI, B,
2442                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2443   }
2444 
2445   B.setInstr(MI);
2446 
2447   uint64_t Offset =
2448     ST.getTargetLowering()->getImplicitParameterOffset(
2449       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2450   Register DstReg = MI.getOperand(0).getReg();
2451   LLT DstTy = MRI.getType(DstReg);
2452   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2453 
2454   const ArgDescriptor *Arg;
2455   const TargetRegisterClass *RC;
2456   std::tie(Arg, RC)
2457     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2458   if (!Arg)
2459     return false;
2460 
2461   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2462   if (!loadInputValue(KernargPtrReg, B, Arg))
2463     return false;
2464 
2465   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2466   MI.eraseFromParent();
2467   return true;
2468 }
2469 
2470 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2471                                               MachineRegisterInfo &MRI,
2472                                               MachineIRBuilder &B,
2473                                               unsigned AddrSpace) const {
2474   B.setInstr(MI);
2475   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2476   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2477   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2478   MI.eraseFromParent();
2479   return true;
2480 }
2481 
2482 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2483 // offset (the offset that is included in bounds checking and swizzling, to be
2484 // split between the instruction's voffset and immoffset fields) and soffset
2485 // (the offset that is excluded from bounds checking and swizzling, to go in
2486 // the instruction's soffset field).  This function takes the first kind of
2487 // offset and figures out how to split it between voffset and immoffset.
2488 std::tuple<Register, unsigned, unsigned>
2489 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2490                                         Register OrigOffset) const {
2491   const unsigned MaxImm = 4095;
2492   Register BaseReg;
2493   unsigned TotalConstOffset;
2494   MachineInstr *OffsetDef;
2495   const LLT S32 = LLT::scalar(32);
2496 
2497   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2498     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2499 
2500   unsigned ImmOffset = TotalConstOffset;
2501 
2502   // If the immediate value is too big for the immoffset field, put the value
2503   // and -4096 into the immoffset field so that the value that is copied/added
2504   // for the voffset field is a multiple of 4096, and it stands more chance
2505   // of being CSEd with the copy/add for another similar load/store.
2506   // However, do not do that rounding down to a multiple of 4096 if that is a
2507   // negative number, as it appears to be illegal to have a negative offset
2508   // in the vgpr, even if adding the immediate offset makes it positive.
2509   unsigned Overflow = ImmOffset & ~MaxImm;
2510   ImmOffset -= Overflow;
2511   if ((int32_t)Overflow < 0) {
2512     Overflow += ImmOffset;
2513     ImmOffset = 0;
2514   }
2515 
2516   if (Overflow != 0) {
2517     if (!BaseReg) {
2518       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2519     } else {
2520       auto OverflowVal = B.buildConstant(S32, Overflow);
2521       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2522     }
2523   }
2524 
2525   if (!BaseReg)
2526     BaseReg = B.buildConstant(S32, 0).getReg(0);
2527 
2528   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2529 }
2530 
2531 /// Handle register layout difference for f16 images for some subtargets.
2532 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2533                                              MachineRegisterInfo &MRI,
2534                                              Register Reg) const {
2535   if (!ST.hasUnpackedD16VMem())
2536     return Reg;
2537 
2538   const LLT S16 = LLT::scalar(16);
2539   const LLT S32 = LLT::scalar(32);
2540   LLT StoreVT = MRI.getType(Reg);
2541   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2542 
2543   auto Unmerge = B.buildUnmerge(S16, Reg);
2544 
2545   SmallVector<Register, 4> WideRegs;
2546   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2547     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2548 
2549   int NumElts = StoreVT.getNumElements();
2550 
2551   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2552 }
2553 
2554 Register AMDGPULegalizerInfo::fixStoreSourceType(
2555   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2556   MachineRegisterInfo *MRI = B.getMRI();
2557   LLT Ty = MRI->getType(VData);
2558 
2559   const LLT S16 = LLT::scalar(16);
2560 
2561   // Fixup illegal register types for i8 stores.
2562   if (Ty == LLT::scalar(8) || Ty == S16) {
2563     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2564     return AnyExt;
2565   }
2566 
2567   if (Ty.isVector()) {
2568     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2569       if (IsFormat)
2570         return handleD16VData(B, *MRI, VData);
2571     }
2572   }
2573 
2574   return VData;
2575 }
2576 
2577 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2578                                               MachineRegisterInfo &MRI,
2579                                               MachineIRBuilder &B,
2580                                               bool IsTyped,
2581                                               bool IsFormat) const {
2582   B.setInstr(MI);
2583 
2584   Register VData = MI.getOperand(1).getReg();
2585   LLT Ty = MRI.getType(VData);
2586   LLT EltTy = Ty.getScalarType();
2587   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2588   const LLT S32 = LLT::scalar(32);
2589 
2590   VData = fixStoreSourceType(B, VData, IsFormat);
2591   Register RSrc = MI.getOperand(2).getReg();
2592 
2593   MachineMemOperand *MMO = *MI.memoperands_begin();
2594   const int MemSize = MMO->getSize();
2595 
2596   unsigned ImmOffset;
2597   unsigned TotalOffset;
2598 
2599   // The typed intrinsics add an immediate after the registers.
2600   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2601 
2602   // The struct intrinsic variants add one additional operand over raw.
2603   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2604   Register VIndex;
2605   int OpOffset = 0;
2606   if (HasVIndex) {
2607     VIndex = MI.getOperand(3).getReg();
2608     OpOffset = 1;
2609   }
2610 
2611   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2612   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2613 
2614   unsigned Format = 0;
2615   if (IsTyped) {
2616     Format = MI.getOperand(5 + OpOffset).getImm();
2617     ++OpOffset;
2618   }
2619 
2620   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2621 
2622   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2623   if (TotalOffset != 0)
2624     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2625 
2626   unsigned Opc;
2627   if (IsTyped) {
2628     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2629                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2630   } else if (IsFormat) {
2631     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2632                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2633   } else {
2634     switch (MemSize) {
2635     case 1:
2636       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2637       break;
2638     case 2:
2639       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2640       break;
2641     default:
2642       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2643       break;
2644     }
2645   }
2646 
2647   if (!VIndex)
2648     VIndex = B.buildConstant(S32, 0).getReg(0);
2649 
2650   auto MIB = B.buildInstr(Opc)
2651     .addUse(VData)              // vdata
2652     .addUse(RSrc)               // rsrc
2653     .addUse(VIndex)             // vindex
2654     .addUse(VOffset)            // voffset
2655     .addUse(SOffset)            // soffset
2656     .addImm(ImmOffset);         // offset(imm)
2657 
2658   if (IsTyped)
2659     MIB.addImm(Format);
2660 
2661   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2662      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2663      .addMemOperand(MMO);
2664 
2665   MI.eraseFromParent();
2666   return true;
2667 }
2668 
2669 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2670                                              MachineRegisterInfo &MRI,
2671                                              MachineIRBuilder &B,
2672                                              bool IsFormat,
2673                                              bool IsTyped) const {
2674   B.setInstr(MI);
2675 
2676   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2677   MachineMemOperand *MMO = *MI.memoperands_begin();
2678   const int MemSize = MMO->getSize();
2679   const LLT S32 = LLT::scalar(32);
2680 
2681   Register Dst = MI.getOperand(0).getReg();
2682   Register RSrc = MI.getOperand(2).getReg();
2683 
2684   // The typed intrinsics add an immediate after the registers.
2685   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2686 
2687   // The struct intrinsic variants add one additional operand over raw.
2688   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2689   Register VIndex;
2690   int OpOffset = 0;
2691   if (HasVIndex) {
2692     VIndex = MI.getOperand(3).getReg();
2693     OpOffset = 1;
2694   }
2695 
2696   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2697   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2698 
2699   unsigned Format = 0;
2700   if (IsTyped) {
2701     Format = MI.getOperand(5 + OpOffset).getImm();
2702     ++OpOffset;
2703   }
2704 
2705   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2706   unsigned ImmOffset;
2707   unsigned TotalOffset;
2708 
2709   LLT Ty = MRI.getType(Dst);
2710   LLT EltTy = Ty.getScalarType();
2711   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2712   const bool Unpacked = ST.hasUnpackedD16VMem();
2713 
2714   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2715   if (TotalOffset != 0)
2716     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2717 
2718   unsigned Opc;
2719 
2720   if (IsTyped) {
2721     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2722                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2723   } else if (IsFormat) {
2724     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2725                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2726   } else {
2727     switch (MemSize) {
2728     case 1:
2729       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2730       break;
2731     case 2:
2732       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2733       break;
2734     default:
2735       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2736       break;
2737     }
2738   }
2739 
2740   Register LoadDstReg;
2741 
2742   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2743   LLT UnpackedTy = Ty.changeElementSize(32);
2744 
2745   if (IsExtLoad)
2746     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2747   else if (Unpacked && IsD16 && Ty.isVector())
2748     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2749   else
2750     LoadDstReg = Dst;
2751 
2752   if (!VIndex)
2753     VIndex = B.buildConstant(S32, 0).getReg(0);
2754 
2755   auto MIB = B.buildInstr(Opc)
2756     .addDef(LoadDstReg)         // vdata
2757     .addUse(RSrc)               // rsrc
2758     .addUse(VIndex)             // vindex
2759     .addUse(VOffset)            // voffset
2760     .addUse(SOffset)            // soffset
2761     .addImm(ImmOffset);         // offset(imm)
2762 
2763   if (IsTyped)
2764     MIB.addImm(Format);
2765 
2766   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2767      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2768      .addMemOperand(MMO);
2769 
2770   if (LoadDstReg != Dst) {
2771     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2772 
2773     // Widen result for extending loads was widened.
2774     if (IsExtLoad)
2775       B.buildTrunc(Dst, LoadDstReg);
2776     else {
2777       // Repack to original 16-bit vector result
2778       // FIXME: G_TRUNC should work, but legalization currently fails
2779       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2780       SmallVector<Register, 4> Repack;
2781       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2782         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2783       B.buildMerge(Dst, Repack);
2784     }
2785   }
2786 
2787   MI.eraseFromParent();
2788   return true;
2789 }
2790 
2791 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2792                                                MachineIRBuilder &B,
2793                                                bool IsInc) const {
2794   B.setInstr(MI);
2795   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2796                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2797   B.buildInstr(Opc)
2798     .addDef(MI.getOperand(0).getReg())
2799     .addUse(MI.getOperand(2).getReg())
2800     .addUse(MI.getOperand(3).getReg())
2801     .cloneMemRefs(MI);
2802   MI.eraseFromParent();
2803   return true;
2804 }
2805 
2806 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2807   switch (IntrID) {
2808   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2809   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2810     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2811   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2812   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2813     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2814   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2815   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2816     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2817   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2818   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2819     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2820   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2821   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2822     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2823   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2824   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2825     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2826   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2827   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2828     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2829   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2830   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2831     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2832   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2833   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2834     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2835   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2836   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2837     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
2838   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2839   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2840     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
2841   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2842   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2843     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
2844   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
2845   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
2846     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
2847   default:
2848     llvm_unreachable("unhandled atomic opcode");
2849   }
2850 }
2851 
2852 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
2853                                                MachineIRBuilder &B,
2854                                                Intrinsic::ID IID) const {
2855   B.setInstr(MI);
2856 
2857   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
2858                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
2859 
2860   Register Dst = MI.getOperand(0).getReg();
2861   Register VData = MI.getOperand(2).getReg();
2862 
2863   Register CmpVal;
2864   int OpOffset = 0;
2865 
2866   if (IsCmpSwap) {
2867     CmpVal = MI.getOperand(3 + OpOffset).getReg();
2868     ++OpOffset;
2869   }
2870 
2871   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2872   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2873 
2874   // The struct intrinsic variants add one additional operand over raw.
2875   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2876   Register VIndex;
2877   if (HasVIndex) {
2878     VIndex = MI.getOperand(4 + OpOffset).getReg();
2879     ++OpOffset;
2880   }
2881 
2882   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2883   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2884   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2885 
2886   MachineMemOperand *MMO = *MI.memoperands_begin();
2887 
2888   unsigned ImmOffset;
2889   unsigned TotalOffset;
2890   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2891   if (TotalOffset != 0)
2892     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
2893 
2894   if (!VIndex)
2895     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
2896 
2897   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
2898     .addDef(Dst)
2899     .addUse(VData); // vdata
2900 
2901   if (IsCmpSwap)
2902     MIB.addReg(CmpVal);
2903 
2904   MIB.addUse(RSrc)               // rsrc
2905      .addUse(VIndex)             // vindex
2906      .addUse(VOffset)            // voffset
2907      .addUse(SOffset)            // soffset
2908      .addImm(ImmOffset)          // offset(imm)
2909      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2910      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2911      .addMemOperand(MMO);
2912 
2913   MI.eraseFromParent();
2914   return true;
2915 }
2916 
2917 // Produce a vector of s16 elements from s32 pieces.
2918 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
2919                              ArrayRef<Register> UnmergeParts) {
2920   const LLT S16 = LLT::scalar(16);
2921 
2922   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
2923   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
2924     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
2925 
2926   B.buildBuildVector(DstReg, RemergeParts);
2927 }
2928 
2929 /// Convert a set of s32 registers to a result vector with s16 elements.
2930 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
2931                                ArrayRef<Register> UnmergeParts) {
2932   MachineRegisterInfo &MRI = *B.getMRI();
2933   const LLT V2S16 = LLT::vector(2, 16);
2934   LLT TargetTy = MRI.getType(DstReg);
2935   int NumElts = UnmergeParts.size();
2936 
2937   if (NumElts == 1) {
2938     assert(TargetTy == V2S16);
2939     B.buildBitcast(DstReg, UnmergeParts[0]);
2940     return;
2941   }
2942 
2943   SmallVector<Register, 4> RemergeParts(NumElts);
2944   for (int I = 0; I != NumElts; ++I)
2945     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
2946 
2947   if (TargetTy.getSizeInBits() == 32u * NumElts) {
2948     B.buildConcatVectors(DstReg, RemergeParts);
2949     return;
2950   }
2951 
2952   const LLT V3S16 = LLT::vector(3, 16);
2953   const LLT V6S16 = LLT::vector(6, 16);
2954 
2955   // Widen to v6s16 and unpack v3 parts.
2956   assert(TargetTy == V3S16);
2957 
2958   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
2959   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
2960   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
2961 }
2962 
2963 // FIXME: Just vector trunc should be sufficent, but legalization currently
2964 // broken.
2965 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
2966                                   Register WideDstReg) {
2967   const LLT S32 = LLT::scalar(32);
2968   const LLT S16 = LLT::scalar(16);
2969 
2970   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
2971 
2972   int NumOps = Unmerge->getNumOperands() - 1;
2973   SmallVector<Register, 4> RemergeParts(NumOps);
2974   for (int I = 0; I != NumOps; ++I)
2975     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
2976 
2977   B.buildBuildVector(DstReg, RemergeParts);
2978 }
2979 
2980 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
2981     MachineInstr &MI, MachineIRBuilder &B,
2982     GISelChangeObserver &Observer,
2983     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
2984   bool IsTFE = MI.getNumExplicitDefs() == 2;
2985 
2986   // We are only processing the operands of d16 image operations on subtargets
2987   // that use the unpacked register layout, or need to repack the TFE result.
2988 
2989   // TODO: Need to handle a16 images too
2990   // TODO: Do we need to guard against already legalized intrinsics?
2991   if (!IsTFE && !ST.hasUnpackedD16VMem())
2992     return true;
2993 
2994   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2995     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
2996 
2997   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
2998     return true;
2999 
3000   B.setInstr(MI);
3001 
3002   MachineRegisterInfo *MRI = B.getMRI();
3003   const LLT S32 = LLT::scalar(32);
3004   const LLT S16 = LLT::scalar(16);
3005 
3006   if (BaseOpcode->Store) { // No TFE for stores?
3007     Register VData = MI.getOperand(1).getReg();
3008     LLT Ty = MRI->getType(VData);
3009     if (!Ty.isVector() || Ty.getElementType() != S16)
3010       return true;
3011 
3012     B.setInstr(MI);
3013 
3014     Observer.changingInstr(MI);
3015     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3016     Observer.changedInstr(MI);
3017     return true;
3018   }
3019 
3020   Register DstReg = MI.getOperand(0).getReg();
3021   LLT Ty = MRI->getType(DstReg);
3022   const LLT EltTy = Ty.getScalarType();
3023   const bool IsD16 = Ty.getScalarType() == S16;
3024   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3025 
3026   if (IsTFE) {
3027     // In the IR, TFE is supposed to be used with a 2 element struct return
3028     // type. The intruction really returns these two values in one contiguous
3029     // register, with one additional dword beyond the loaded data. Rewrite the
3030     // return type to use a single register result.
3031     Register Dst1Reg = MI.getOperand(1).getReg();
3032     if (MRI->getType(Dst1Reg) != S32)
3033       return false;
3034 
3035     // TODO: Make sure the TFE operand bit is set.
3036 
3037     // The raw dword aligned data component of the load. The only legal cases
3038     // where this matters should be when using the packed D16 format, for
3039     // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3040     LLT RoundedTy;
3041     LLT TFETy;
3042 
3043     if (IsD16 && ST.hasUnpackedD16VMem()) {
3044       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3045       TFETy = LLT::vector(NumElts + 1, 32);
3046     } else {
3047       unsigned EltSize = Ty.getScalarSizeInBits();
3048       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3049       unsigned RoundedSize = 32 * RoundedElts;
3050       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3051       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3052     }
3053 
3054     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3055     Observer.changingInstr(MI);
3056 
3057     MI.getOperand(0).setReg(TFEReg);
3058     MI.RemoveOperand(1);
3059 
3060     Observer.changedInstr(MI);
3061 
3062     // Insert after the instruction.
3063     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3064 
3065     // Now figure out how to copy the new result register back into the old
3066     // result.
3067 
3068     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3069     int NumDataElts = TFETy.getNumElements() - 1;
3070 
3071     if (!Ty.isVector()) {
3072       // Simplest case is a trivial unmerge (plus a truncate for d16).
3073       UnmergeResults[0] = Ty == S32 ?
3074         DstReg : MRI->createGenericVirtualRegister(S32);
3075 
3076       B.buildUnmerge(UnmergeResults, TFEReg);
3077       if (Ty != S32)
3078         B.buildTrunc(DstReg, UnmergeResults[0]);
3079       return true;
3080     }
3081 
3082     // We have to repack into a new vector of some kind.
3083     for (int I = 0; I != NumDataElts; ++I)
3084       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3085     B.buildUnmerge(UnmergeResults, TFEReg);
3086 
3087     // Drop the final TFE element.
3088     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3089 
3090     if (EltTy == S32)
3091       B.buildBuildVector(DstReg, DataPart);
3092     else if (ST.hasUnpackedD16VMem())
3093       truncToS16Vector(B, DstReg, DataPart);
3094     else
3095       bitcastToS16Vector(B, DstReg, DataPart);
3096 
3097     return true;
3098   }
3099 
3100   // Must be an image load.
3101   if (!Ty.isVector() || Ty.getElementType() != S16)
3102     return true;
3103 
3104   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3105 
3106   LLT WidenedTy = Ty.changeElementType(S32);
3107   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3108 
3109   Observer.changingInstr(MI);
3110   MI.getOperand(0).setReg(WideDstReg);
3111   Observer.changedInstr(MI);
3112 
3113   repackUnpackedD16Load(B, DstReg, WideDstReg);
3114   return true;
3115 }
3116 
3117 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3118   MachineInstr &MI, MachineIRBuilder &B,
3119   GISelChangeObserver &Observer) const {
3120   Register Dst = MI.getOperand(0).getReg();
3121   LLT Ty = B.getMRI()->getType(Dst);
3122   unsigned Size = Ty.getSizeInBits();
3123 
3124   // There are no 96-bit result scalar loads, but widening to 128-bit should
3125   // always be legal. We may need to restore this to a 96-bit result if it turns
3126   // out this needs to be converted to a vector load during RegBankSelect.
3127   if (isPowerOf2_32(Size))
3128     return true;
3129 
3130   LegalizerHelper Helper(B.getMF(), *this, Observer, B);
3131   B.setInstr(MI);
3132 
3133   Observer.changingInstr(MI);
3134 
3135   if (Ty.isVector())
3136     Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3137   else
3138     Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3139 
3140   Observer.changedInstr(MI);
3141   return true;
3142 }
3143 
3144 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3145                                             MachineIRBuilder &B,
3146                                             GISelChangeObserver &Observer) const {
3147   MachineRegisterInfo &MRI = *B.getMRI();
3148 
3149   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
3150   auto IntrID = MI.getIntrinsicID();
3151   switch (IntrID) {
3152   case Intrinsic::amdgcn_if:
3153   case Intrinsic::amdgcn_else: {
3154     MachineInstr *Br = nullptr;
3155     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3156       const SIRegisterInfo *TRI
3157         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3158 
3159       B.setInstr(*BrCond);
3160       Register Def = MI.getOperand(1).getReg();
3161       Register Use = MI.getOperand(3).getReg();
3162 
3163       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3164       if (Br)
3165         BrTarget = Br->getOperand(0).getMBB();
3166 
3167       if (IntrID == Intrinsic::amdgcn_if) {
3168         B.buildInstr(AMDGPU::SI_IF)
3169           .addDef(Def)
3170           .addUse(Use)
3171           .addMBB(BrTarget);
3172       } else {
3173         B.buildInstr(AMDGPU::SI_ELSE)
3174           .addDef(Def)
3175           .addUse(Use)
3176           .addMBB(BrTarget)
3177           .addImm(0);
3178       }
3179 
3180       if (Br)
3181         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3182 
3183       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3184       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3185       MI.eraseFromParent();
3186       BrCond->eraseFromParent();
3187       return true;
3188     }
3189 
3190     return false;
3191   }
3192   case Intrinsic::amdgcn_loop: {
3193     MachineInstr *Br = nullptr;
3194     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3195       const SIRegisterInfo *TRI
3196         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3197 
3198       B.setInstr(*BrCond);
3199 
3200       // FIXME: Need to adjust branch targets based on unconditional branch.
3201       Register Reg = MI.getOperand(2).getReg();
3202       B.buildInstr(AMDGPU::SI_LOOP)
3203         .addUse(Reg)
3204         .addMBB(BrCond->getOperand(1).getMBB());
3205       MI.eraseFromParent();
3206       BrCond->eraseFromParent();
3207       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3208       return true;
3209     }
3210 
3211     return false;
3212   }
3213   case Intrinsic::amdgcn_kernarg_segment_ptr:
3214     return legalizePreloadedArgIntrin(
3215       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3216   case Intrinsic::amdgcn_implicitarg_ptr:
3217     return legalizeImplicitArgPtr(MI, MRI, B);
3218   case Intrinsic::amdgcn_workitem_id_x:
3219     return legalizePreloadedArgIntrin(MI, MRI, B,
3220                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3221   case Intrinsic::amdgcn_workitem_id_y:
3222     return legalizePreloadedArgIntrin(MI, MRI, B,
3223                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3224   case Intrinsic::amdgcn_workitem_id_z:
3225     return legalizePreloadedArgIntrin(MI, MRI, B,
3226                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3227   case Intrinsic::amdgcn_workgroup_id_x:
3228     return legalizePreloadedArgIntrin(MI, MRI, B,
3229                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3230   case Intrinsic::amdgcn_workgroup_id_y:
3231     return legalizePreloadedArgIntrin(MI, MRI, B,
3232                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3233   case Intrinsic::amdgcn_workgroup_id_z:
3234     return legalizePreloadedArgIntrin(MI, MRI, B,
3235                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3236   case Intrinsic::amdgcn_dispatch_ptr:
3237     return legalizePreloadedArgIntrin(MI, MRI, B,
3238                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3239   case Intrinsic::amdgcn_queue_ptr:
3240     return legalizePreloadedArgIntrin(MI, MRI, B,
3241                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3242   case Intrinsic::amdgcn_implicit_buffer_ptr:
3243     return legalizePreloadedArgIntrin(
3244       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3245   case Intrinsic::amdgcn_dispatch_id:
3246     return legalizePreloadedArgIntrin(MI, MRI, B,
3247                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3248   case Intrinsic::amdgcn_fdiv_fast:
3249     return legalizeFDIVFastIntrin(MI, MRI, B);
3250   case Intrinsic::amdgcn_is_shared:
3251     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3252   case Intrinsic::amdgcn_is_private:
3253     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3254   case Intrinsic::amdgcn_wavefrontsize: {
3255     B.setInstr(MI);
3256     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3257     MI.eraseFromParent();
3258     return true;
3259   }
3260   case Intrinsic::amdgcn_s_buffer_load:
3261     return legalizeSBufferLoad(MI, B, Observer);
3262   case Intrinsic::amdgcn_raw_buffer_store:
3263   case Intrinsic::amdgcn_struct_buffer_store:
3264     return legalizeBufferStore(MI, MRI, B, false, false);
3265   case Intrinsic::amdgcn_raw_buffer_store_format:
3266   case Intrinsic::amdgcn_struct_buffer_store_format:
3267     return legalizeBufferStore(MI, MRI, B, false, true);
3268   case Intrinsic::amdgcn_raw_tbuffer_store:
3269   case Intrinsic::amdgcn_struct_tbuffer_store:
3270     return legalizeBufferStore(MI, MRI, B, true, true);
3271   case Intrinsic::amdgcn_raw_buffer_load:
3272   case Intrinsic::amdgcn_struct_buffer_load:
3273     return legalizeBufferLoad(MI, MRI, B, false, false);
3274   case Intrinsic::amdgcn_raw_buffer_load_format:
3275   case Intrinsic::amdgcn_struct_buffer_load_format:
3276     return legalizeBufferLoad(MI, MRI, B, true, false);
3277   case Intrinsic::amdgcn_raw_tbuffer_load:
3278   case Intrinsic::amdgcn_struct_tbuffer_load:
3279     return legalizeBufferLoad(MI, MRI, B, true, true);
3280   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3281   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3282   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3283   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3284   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3285   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3286   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3287   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3288   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3289   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3290   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3291   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3292   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3293   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3294   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3295   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3296   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3297   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3298   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3299   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3300   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3301   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3302   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3303   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3304   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3305   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3306     return legalizeBufferAtomic(MI, B, IntrID);
3307   case Intrinsic::amdgcn_atomic_inc:
3308     return legalizeAtomicIncDec(MI, B, true);
3309   case Intrinsic::amdgcn_atomic_dec:
3310     return legalizeAtomicIncDec(MI, B, false);
3311   default: {
3312     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3313             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3314       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3315     return true;
3316   }
3317   }
3318 
3319   return true;
3320 }
3321