1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 static LegalityPredicate isMultiple32(unsigned TypeIdx,
46                                       unsigned MaxSize = 1024) {
47   return [=](const LegalityQuery &Query) {
48     const LLT Ty = Query.Types[TypeIdx];
49     const LLT EltTy = Ty.getScalarType();
50     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
51   };
52 }
53 
54 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
55   return [=](const LegalityQuery &Query) {
56     return Query.Types[TypeIdx].getSizeInBits() == Size;
57   };
58 }
59 
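// True for vectors such as v3s16 or v5s8: an odd number of sub-32-bit
// elements whose total size is not a multiple of 32 bits.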
60 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     return Ty.isVector() &&
64            Ty.getNumElements() % 2 != 0 &&
65            Ty.getElementType().getSizeInBits() < 32 &&
66            Ty.getSizeInBits() % 32 != 0;
67   };
68 }
69 
70 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
71   return [=](const LegalityQuery &Query) {
72     const LLT Ty = Query.Types[TypeIdx];
73     const LLT EltTy = Ty.getScalarType();
74     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
75   };
76 }
77 
78 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT Ty = Query.Types[TypeIdx];
81     const LLT EltTy = Ty.getElementType();
82     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
83   };
84 }
85 
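// Mutation for fewerElementsIf: shrink a wide vector toward 64-bit pieces,
// e.g. v4s32 (128 bits) becomes v2s32 (64 bits).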
86 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getElementType();
90     unsigned Size = Ty.getSizeInBits();
91     unsigned Pieces = (Size + 63) / 64;
92     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
93     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
94   };
95 }
96 
// Increase the number of vector elements so the total size rounds up to the
// next multiple of 32 bits.
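// For example, v3s8 (24 bits) becomes v4s8 (32 bits) and v3s16 (48 bits)
// becomes v4s16 (64 bits).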
99 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
100   return [=](const LegalityQuery &Query) {
101     const LLT Ty = Query.Types[TypeIdx];
102 
103     const LLT EltTy = Ty.getElementType();
104     const int Size = Ty.getSizeInBits();
105     const int EltSize = EltTy.getSizeInBits();
106     const int NextMul32 = (Size + 31) / 32;
107 
108     assert(EltSize < 32);
109 
110     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
111     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
112   };
113 }
114 
115 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
116   return [=](const LegalityQuery &Query) {
117     const LLT QueryTy = Query.Types[TypeIdx];
118     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
119   };
120 }
121 
122 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
123   return [=](const LegalityQuery &Query) {
124     const LLT QueryTy = Query.Types[TypeIdx];
125     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
126   };
127 }
128 
129 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
133   };
134 }
135 
136 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
137 // v2s16.
138 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
139   return [=](const LegalityQuery &Query) {
140     const LLT Ty = Query.Types[TypeIdx];
141     if (Ty.isVector()) {
142       const int EltSize = Ty.getElementType().getSizeInBits();
143       return EltSize == 32 || EltSize == 64 ||
144             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
145              EltSize == 128 || EltSize == 256;
146     }
147 
148     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
149   };
150 }
151 
152 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
153   return [=](const LegalityQuery &Query) {
154     return Query.Types[TypeIdx].getElementType() == Type;
155   };
156 }
157 
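// Match a truncating store of a scalar wider than 32 bits, i.e. the memory
// size is smaller than the register size.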
158 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
159   return [=](const LegalityQuery &Query) {
160     const LLT Ty = Query.Types[TypeIdx];
161     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
162            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
163   };
164 }
165 
166 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
167                                          const GCNTargetMachine &TM)
168   :  ST(ST_) {
169   using namespace TargetOpcode;
170 
171   auto GetAddrSpacePtr = [&TM](unsigned AS) {
172     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
173   };
174 
175   const LLT S1 = LLT::scalar(1);
176   const LLT S8 = LLT::scalar(8);
177   const LLT S16 = LLT::scalar(16);
178   const LLT S32 = LLT::scalar(32);
179   const LLT S64 = LLT::scalar(64);
180   const LLT S96 = LLT::scalar(96);
181   const LLT S128 = LLT::scalar(128);
182   const LLT S256 = LLT::scalar(256);
183   const LLT S1024 = LLT::scalar(1024);
184 
185   const LLT V2S16 = LLT::vector(2, 16);
186   const LLT V4S16 = LLT::vector(4, 16);
187 
188   const LLT V2S32 = LLT::vector(2, 32);
189   const LLT V3S32 = LLT::vector(3, 32);
190   const LLT V4S32 = LLT::vector(4, 32);
191   const LLT V5S32 = LLT::vector(5, 32);
192   const LLT V6S32 = LLT::vector(6, 32);
193   const LLT V7S32 = LLT::vector(7, 32);
194   const LLT V8S32 = LLT::vector(8, 32);
195   const LLT V9S32 = LLT::vector(9, 32);
196   const LLT V10S32 = LLT::vector(10, 32);
197   const LLT V11S32 = LLT::vector(11, 32);
198   const LLT V12S32 = LLT::vector(12, 32);
199   const LLT V13S32 = LLT::vector(13, 32);
200   const LLT V14S32 = LLT::vector(14, 32);
201   const LLT V15S32 = LLT::vector(15, 32);
202   const LLT V16S32 = LLT::vector(16, 32);
203   const LLT V32S32 = LLT::vector(32, 32);
204 
205   const LLT V2S64 = LLT::vector(2, 64);
206   const LLT V3S64 = LLT::vector(3, 64);
207   const LLT V4S64 = LLT::vector(4, 64);
208   const LLT V5S64 = LLT::vector(5, 64);
209   const LLT V6S64 = LLT::vector(6, 64);
210   const LLT V7S64 = LLT::vector(7, 64);
211   const LLT V8S64 = LLT::vector(8, 64);
212   const LLT V16S64 = LLT::vector(16, 64);
213 
214   std::initializer_list<LLT> AllS32Vectors =
215     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
216      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
217   std::initializer_list<LLT> AllS64Vectors =
218     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
219 
220   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
221   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
222   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
223   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
224   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
225   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
226   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
227 
228   const LLT CodePtr = FlatPtr;
229 
230   const std::initializer_list<LLT> AddrSpaces64 = {
231     GlobalPtr, ConstantPtr, FlatPtr
232   };
233 
234   const std::initializer_list<LLT> AddrSpaces32 = {
235     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
236   };
237 
238   const std::initializer_list<LLT> FPTypesBase = {
239     S32, S64
240   };
241 
242   const std::initializer_list<LLT> FPTypes16 = {
243     S32, S64, S16
244   };
245 
246   const std::initializer_list<LLT> FPTypesPK16 = {
247     S32, S64, S16, V2S16
248   };
249 
250   const LLT MinLegalScalarShiftTy = ST.has16BitInsts() ? S16 : S32;
251 
252   setAction({G_BRCOND, S1}, Legal); // VCC branches
253   setAction({G_BRCOND, S32}, Legal); // SCC branches
254 
255   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
256   // elements for v3s16
257   getActionDefinitionsBuilder(G_PHI)
258     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
259     .legalFor(AllS32Vectors)
260     .legalFor(AllS64Vectors)
261     .legalFor(AddrSpaces64)
262     .legalFor(AddrSpaces32)
263     .clampScalar(0, S32, S256)
264     .widenScalarToNextPow2(0, 32)
265     .clampMaxNumElements(0, S32, 16)
266     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
267     .legalIf(isPointer(0));
268 
269   if (ST.has16BitInsts()) {
270     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
271       .legalFor({S32, S16})
272       .clampScalar(0, S16, S32)
273       .scalarize(0);
274   } else {
275     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
276       .legalFor({S32})
277       .clampScalar(0, S32, S32)
278       .scalarize(0);
279   }
280 
281   // FIXME: Not really legal. Placeholder for custom lowering.
282   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
283     .legalFor({S32, S64})
284     .clampScalar(0, S32, S64)
285     .widenScalarToNextPow2(0, 32)
286     .scalarize(0);
287 
288   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
289     .legalFor({S32})
290     .clampScalar(0, S32, S32)
291     .scalarize(0);
292 
293   // Report legal for any types we can handle anywhere. For the cases only legal
294   // on the SALU, RegBankSelect will be able to re-legalize.
295   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
296     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
297     .clampScalar(0, S32, S64)
298     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
299     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
300     .widenScalarToNextPow2(0)
301     .scalarize(0);
302 
303   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
304                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
305     .legalFor({{S32, S1}, {S32, S32}})
306     .clampScalar(0, S32, S32)
307     .scalarize(0); // TODO: Implement.
308 
309   getActionDefinitionsBuilder(G_BITCAST)
310     // Don't worry about the size constraint.
311     .legalIf(all(isRegisterType(0), isRegisterType(1)))
312     // FIXME: Testing hack
313     .legalForCartesianProduct({S16, LLT::vector(2, 8), })
314     .lower();
315 
316 
317   getActionDefinitionsBuilder(G_CONSTANT)
318     .legalFor({S1, S32, S64, S16, GlobalPtr,
319                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
320     .clampScalar(0, S32, S64)
321     .widenScalarToNextPow2(0)
322     .legalIf(isPointer(0));
323 
324   getActionDefinitionsBuilder(G_FCONSTANT)
325     .legalFor({S32, S64, S16})
326     .clampScalar(0, S16, S64);
327 
328   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
329     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
330                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
331     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
332     .clampScalarOrElt(0, S32, S1024)
333     .legalIf(isMultiple32(0))
334     .widenScalarToNextPow2(0, 32)
335     .clampMaxNumElements(0, S32, 16);
336 
337   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
338   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
339     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
340   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
341 
342   auto &FPOpActions = getActionDefinitionsBuilder(
343     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
344     .legalFor({S32, S64});
345   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
346     .customFor({S32, S64});
347   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
348     .customFor({S32, S64});
349 
350   if (ST.has16BitInsts()) {
351     if (ST.hasVOP3PInsts())
352       FPOpActions.legalFor({S16, V2S16});
353     else
354       FPOpActions.legalFor({S16});
355 
356     TrigActions.customFor({S16});
357     FDIVActions.customFor({S16});
358   }
359 
360   auto &MinNumMaxNum = getActionDefinitionsBuilder({
361       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
362 
363   if (ST.hasVOP3PInsts()) {
364     MinNumMaxNum.customFor(FPTypesPK16)
365       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366       .clampMaxNumElements(0, S16, 2)
367       .clampScalar(0, S16, S64)
368       .scalarize(0);
369   } else if (ST.has16BitInsts()) {
370     MinNumMaxNum.customFor(FPTypes16)
371       .clampScalar(0, S16, S64)
372       .scalarize(0);
373   } else {
374     MinNumMaxNum.customFor(FPTypesBase)
375       .clampScalar(0, S32, S64)
376       .scalarize(0);
377   }
378 
379   if (ST.hasVOP3PInsts())
380     FPOpActions.clampMaxNumElements(0, S16, 2);
381 
382   FPOpActions
383     .scalarize(0)
384     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
385 
386   TrigActions
387     .scalarize(0)
388     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
389 
390   FDIVActions
391     .scalarize(0)
392     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
393 
394   getActionDefinitionsBuilder({G_FNEG, G_FABS})
395     .legalFor(FPTypesPK16)
396     .clampMaxNumElements(0, S16, 2)
397     .scalarize(0)
398     .clampScalar(0, S16, S64);
399 
400   if (ST.has16BitInsts()) {
401     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
402       .legalFor({S32, S64, S16})
403       .scalarize(0)
404       .clampScalar(0, S16, S64);
405   } else {
406     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
407       .legalFor({S32, S64})
408       .scalarize(0)
409       .clampScalar(0, S32, S64);
410   }
411 
412   getActionDefinitionsBuilder(G_FPTRUNC)
413     .legalFor({{S32, S64}, {S16, S32}})
414     .scalarize(0);
415 
416   getActionDefinitionsBuilder(G_FPEXT)
417     .legalFor({{S64, S32}, {S32, S16}})
418     .lowerFor({{S64, S16}}) // FIXME: Implement
419     .scalarize(0);
420 
421   getActionDefinitionsBuilder(G_FSUB)
422       // Use actual fsub instruction
423       .legalFor({S32})
424       // Must use fadd + fneg
425       .lowerFor({S64, S16, V2S16})
426       .scalarize(0)
427       .clampScalar(0, S32, S64);
428 
429   // Whether this is legal depends on the floating point mode for the function.
430   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
431   if (ST.hasMadF16())
432     FMad.customFor({S32, S16});
433   else
434     FMad.customFor({S32});
435   FMad.scalarize(0)
436       .lower();
437 
438   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
439     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
440                {S32, S1}, {S64, S1}, {S16, S1},
441                {S96, S32},
442                // FIXME: Hack
443                {S64, LLT::scalar(33)},
444                {S32, S8}, {S32, LLT::scalar(24)}})
445     .scalarize(0)
446     .clampScalar(0, S32, S64);
447 
448   // TODO: Split s1->s64 during regbankselect for VALU.
449   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
450     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
451     .lowerFor({{S32, S64}})
452     .lowerIf(typeIs(1, S1))
453     .customFor({{S64, S64}});
454   if (ST.has16BitInsts())
455     IToFP.legalFor({{S16, S16}});
456   IToFP.clampScalar(1, S32, S64)
457        .scalarize(0);
458 
459   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
460     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
461     .customFor({{S64, S64}});
462   if (ST.has16BitInsts())
463     FPToI.legalFor({{S16, S16}});
464   else
465     FPToI.minScalar(1, S32);
466 
467   FPToI.minScalar(0, S32)
468        .scalarize(0)
469        .lower();
470 
471   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
472     .scalarize(0)
473     .lower();
474 
475   if (ST.has16BitInsts()) {
476     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
477       .legalFor({S16, S32, S64})
478       .clampScalar(0, S16, S64)
479       .scalarize(0);
480   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
481     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
482       .legalFor({S32, S64})
483       .clampScalar(0, S32, S64)
484       .scalarize(0);
485   } else {
486     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
487       .legalFor({S32})
488       .customFor({S64})
489       .clampScalar(0, S32, S64)
490       .scalarize(0);
491   }
492 
493   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
494     .scalarize(0)
495     .alwaysLegal();
496 
497   auto &CmpBuilder =
498     getActionDefinitionsBuilder(G_ICMP)
499     // The compare output type differs based on the register bank of the output,
500     // so make both s1 and s32 legal.
501     //
502     // Scalar compares producing output in scc will be promoted to s32, as that
503     // is the allocatable register type that will be needed for the copy from
504     // scc. This will be promoted during RegBankSelect, and we assume something
505     // before that won't try to use s32 result types.
506     //
507     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
508     // bank.
509     .legalForCartesianProduct(
510       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
511     .legalForCartesianProduct(
512       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
513   if (ST.has16BitInsts()) {
514     CmpBuilder.legalFor({{S1, S16}});
515   }
516 
517   CmpBuilder
518     .widenScalarToNextPow2(1)
519     .clampScalar(1, S32, S64)
520     .scalarize(0)
521     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
522 
523   getActionDefinitionsBuilder(G_FCMP)
524     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
525     .widenScalarToNextPow2(1)
526     .clampScalar(1, S32, S64)
527     .scalarize(0);
528 
529   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
530   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
531                                G_FLOG2})
532     .legalFor({S32})
533     .scalarize(0);
534 
535   getActionDefinitionsBuilder({G_FLOG, G_FLOG10})
536     .customFor({S32})
537     .clampScalar(0, S32, S32)
538     .scalarize(0);
539 
540   // The 64-bit versions produce 32-bit results, but only on the SALU.
541   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
542                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
543                                G_CTPOP})
544     .legalFor({{S32, S32}, {S32, S64}})
545     .clampScalar(0, S32, S32)
546     .clampScalar(1, S32, S64)
547     .scalarize(0)
548     .widenScalarToNextPow2(0, 32)
549     .widenScalarToNextPow2(1, 32);
550 
551   // TODO: Expand for > s32
552   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
553     .legalFor({S32})
554     .clampScalar(0, S32, S32)
555     .scalarize(0);
556 
557   if (ST.has16BitInsts()) {
558     if (ST.hasVOP3PInsts()) {
559       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
560         .legalFor({S32, S16, V2S16})
561         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
562         .clampMaxNumElements(0, S16, 2)
563         .clampScalar(0, S16, S32)
564         .widenScalarToNextPow2(0)
565         .scalarize(0);
566     } else {
567       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
568         .legalFor({S32, S16})
569         .widenScalarToNextPow2(0)
570         .clampScalar(0, S16, S32)
571         .scalarize(0);
572     }
573   } else {
574     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
575       .legalFor({S32})
576       .clampScalar(0, S32, S32)
577       .widenScalarToNextPow2(0)
578       .scalarize(0);
579   }
580 
581   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
582     return [=](const LegalityQuery &Query) {
583       return Query.Types[TypeIdx0].getSizeInBits() <
584              Query.Types[TypeIdx1].getSizeInBits();
585     };
586   };
587 
588   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
589     return [=](const LegalityQuery &Query) {
590       return Query.Types[TypeIdx0].getSizeInBits() >
591              Query.Types[TypeIdx1].getSizeInBits();
592     };
593   };
594 
595   getActionDefinitionsBuilder(G_INTTOPTR)
596     // List the common cases
597     .legalForCartesianProduct(AddrSpaces64, {S64})
598     .legalForCartesianProduct(AddrSpaces32, {S32})
599     .scalarize(0)
600     // Accept any address space as long as the size matches
601     .legalIf(sameSize(0, 1))
602     .widenScalarIf(smallerThan(1, 0),
603       [](const LegalityQuery &Query) {
604         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
605       })
606     .narrowScalarIf(greaterThan(1, 0),
607       [](const LegalityQuery &Query) {
608         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
609       });
610 
611   getActionDefinitionsBuilder(G_PTRTOINT)
612     // List the common cases
613     .legalForCartesianProduct(AddrSpaces64, {S64})
614     .legalForCartesianProduct(AddrSpaces32, {S32})
615     .scalarize(0)
616     // Accept any address space as long as the size matches
617     .legalIf(sameSize(0, 1))
618     .widenScalarIf(smallerThan(0, 1),
619       [](const LegalityQuery &Query) {
620         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
621       })
622     .narrowScalarIf(
623       greaterThan(0, 1),
624       [](const LegalityQuery &Query) {
625         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
626       });
627 
628   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
629     .scalarize(0)
630     .custom();
631 
632   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
633   // handle some operations by just promoting the register during
634   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
635   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
636     switch (AS) {
637     // FIXME: Private element size.
638     case AMDGPUAS::PRIVATE_ADDRESS:
639       return 32;
640     // FIXME: Check subtarget
641     case AMDGPUAS::LOCAL_ADDRESS:
642       return ST.useDS128() ? 128 : 64;
643 
644     // Treat constant and global as identical. SMRD loads are sometimes usable
645     // for global loads (ideally constant address space should be eliminated)
646     // depending on the context. Legality cannot be context dependent, but
647     // RegBankSelect can split the load as necessary depending on the pointer
648     // register bank/uniformity and if the memory is invariant or not written in
649     // a kernel.
650     case AMDGPUAS::CONSTANT_ADDRESS:
651     case AMDGPUAS::GLOBAL_ADDRESS:
652       return 512;
653     default:
654       return 128;
655     }
656   };
657 
658   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
659     const LLT DstTy = Query.Types[0];
660 
661     // Split vector extloads.
662     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
663     unsigned Align = Query.MMODescrs[0].AlignInBits;
664 
665     if (MemSize < DstTy.getSizeInBits())
666       MemSize = std::max(MemSize, Align);
667 
668     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
669       return true;
670 
671     const LLT PtrTy = Query.Types[1];
672     unsigned AS = PtrTy.getAddressSpace();
673     if (MemSize > maxSizeForAddrSpace(AS))
674       return true;
675 
676     // Catch weird sized loads that don't evenly divide into the access sizes
677     // TODO: May be able to widen depending on alignment etc.
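    // In particular, a 3-dword (96-bit) access is only directly supported
    // when the subtarget has dwordx3 load/store instructions.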
678     unsigned NumRegs = MemSize / 32;
679     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
680       return true;
681 
682     if (Align < MemSize) {
683       const SITargetLowering *TLI = ST.getTargetLowering();
684       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
685     }
686 
687     return false;
688   };
689 
690   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
691   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
692   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
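  // When the subtarget supports unaligned buffer access these are 0, i.e. the
  // memory descriptors below impose no minimum alignment.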
693 
694   // TODO: Refine based on subtargets which support unaligned access or 128-bit
695   // LDS
696   // TODO: Unsupported flat for SI.
697 
698   for (unsigned Op : {G_LOAD, G_STORE}) {
699     const bool IsStore = Op == G_STORE;
700 
701     auto &Actions = getActionDefinitionsBuilder(Op);
702     // Whitelist the common cases.
703     // TODO: Pointer loads
704     // TODO: Wide constant loads
705     // TODO: Only CI+ has 3x loads
706     // TODO: Loads to s16 on gfx9
707     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
708                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
709                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
710                                       {S96, GlobalPtr, 96, GlobalAlign32},
711                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
712                                       {S128, GlobalPtr, 128, GlobalAlign32},
713                                       {S64, GlobalPtr, 64, GlobalAlign32},
714                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
715                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
716                                       {S32, GlobalPtr, 8, GlobalAlign8},
717                                       {S32, GlobalPtr, 16, GlobalAlign16},
718 
719                                       {S32, LocalPtr, 32, 32},
720                                       {S64, LocalPtr, 64, 32},
721                                       {V2S32, LocalPtr, 64, 32},
722                                       {S32, LocalPtr, 8, 8},
723                                       {S32, LocalPtr, 16, 16},
724                                       {V2S16, LocalPtr, 32, 32},
725 
726                                       {S32, PrivatePtr, 32, 32},
727                                       {S32, PrivatePtr, 8, 8},
728                                       {S32, PrivatePtr, 16, 16},
729                                       {V2S16, PrivatePtr, 32, 32},
730 
731                                       {S32, FlatPtr, 32, GlobalAlign32},
732                                       {S32, FlatPtr, 16, GlobalAlign16},
733                                       {S32, FlatPtr, 8, GlobalAlign8},
734                                       {V2S16, FlatPtr, 32, GlobalAlign32},
735 
736                                       {S32, ConstantPtr, 32, GlobalAlign32},
737                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
738                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
739                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
740                                       {S64, ConstantPtr, 64, GlobalAlign32},
741                                       {S128, ConstantPtr, 128, GlobalAlign32},
742                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
743     Actions
744         .customIf(typeIs(1, Constant32Ptr))
745         .narrowScalarIf(
746             [=](const LegalityQuery &Query) -> bool {
747               return !Query.Types[0].isVector() && needToSplitLoad(Query);
748             },
749             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
750               const LLT DstTy = Query.Types[0];
751               const LLT PtrTy = Query.Types[1];
752 
753               const unsigned DstSize = DstTy.getSizeInBits();
754               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
755 
756               // Split extloads.
757               if (DstSize > MemSize)
758                 return std::make_pair(0, LLT::scalar(MemSize));
759 
760               if (DstSize > 32 && (DstSize % 32 != 0)) {
761                 // FIXME: Need a way to specify non-extload of larger size if
762                 // suitably aligned.
763                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
764               }
765 
766               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
767               if (MemSize > MaxSize)
768                 return std::make_pair(0, LLT::scalar(MaxSize));
769 
770               unsigned Align = Query.MMODescrs[0].AlignInBits;
771               return std::make_pair(0, LLT::scalar(Align));
772             })
773         .fewerElementsIf(
774             [=](const LegalityQuery &Query) -> bool {
775               return Query.Types[0].isVector() && needToSplitLoad(Query);
776             },
777             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
778               const LLT DstTy = Query.Types[0];
779               const LLT PtrTy = Query.Types[1];
780 
781               LLT EltTy = DstTy.getElementType();
782               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
783 
784               // Split if it's too large for the address space.
785               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
786                 unsigned NumElts = DstTy.getNumElements();
787                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
788 
789                 // FIXME: Refine when odd breakdowns handled
790                 // The scalars will need to be re-legalized.
791                 if (NumPieces == 1 || NumPieces >= NumElts ||
792                     NumElts % NumPieces != 0)
793                   return std::make_pair(0, EltTy);
794 
795                 return std::make_pair(0,
796                                       LLT::vector(NumElts / NumPieces, EltTy));
797               }
798 
799               // Need to split because of alignment.
800               unsigned Align = Query.MMODescrs[0].AlignInBits;
801               unsigned EltSize = EltTy.getSizeInBits();
802               if (EltSize > Align &&
803                   (EltSize / Align < DstTy.getNumElements())) {
804                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
805               }
806 
807               // May need relegalization for the scalars.
808               return std::make_pair(0, EltTy);
809             })
810         .minScalar(0, S32);
811 
812     if (IsStore)
813       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
814 
815     // TODO: Need a bitcast lower option?
816     Actions
817         .legalIf([=](const LegalityQuery &Query) {
818           const LLT Ty0 = Query.Types[0];
819           unsigned Size = Ty0.getSizeInBits();
820           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
821           unsigned Align = Query.MMODescrs[0].AlignInBits;
822 
823           // FIXME: Widening store from alignment not valid.
824           if (MemSize < Size)
825             MemSize = std::max(MemSize, Align);
826 
827           // No extending vector loads.
828           if (Size > MemSize && Ty0.isVector())
829             return false;
830 
831           switch (MemSize) {
832           case 8:
833           case 16:
834             return Size == 32;
835           case 32:
836           case 64:
837           case 128:
838             return true;
839           case 96:
840             return ST.hasDwordx3LoadStores();
841           case 256:
842           case 512:
843             return true;
844           default:
845             return false;
846           }
847         })
848         .widenScalarToNextPow2(0)
849         // TODO: v3s32->v4s32 with alignment
850         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
851   }
852 
853   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
854                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
855                                                   {S32, GlobalPtr, 16, 2 * 8},
856                                                   {S32, LocalPtr, 8, 8},
857                                                   {S32, LocalPtr, 16, 16},
858                                                   {S32, PrivatePtr, 8, 8},
859                                                   {S32, PrivatePtr, 16, 16},
860                                                   {S32, ConstantPtr, 8, 8},
861                                                   {S32, ConstantPtr, 16, 2 * 8}});
862   if (ST.hasFlatAddressSpace()) {
863     ExtLoads.legalForTypesWithMemDesc(
864         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
865   }
866 
867   ExtLoads.clampScalar(0, S32, S32)
868           .widenScalarToNextPow2(0)
869           .unsupportedIfMemSizeNotPow2()
870           .lower();
871 
872   auto &Atomics = getActionDefinitionsBuilder(
873     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
874      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
875      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
876      G_ATOMICRMW_UMIN})
877     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
878                {S64, GlobalPtr}, {S64, LocalPtr}});
879   if (ST.hasFlatAddressSpace()) {
880     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
881   }
882 
883   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
884     .legalFor({{S32, LocalPtr}});
885 
886   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
887   // demarshalling
888   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
889     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
890                 {S32, FlatPtr}, {S64, FlatPtr}})
891     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
892                {S32, RegionPtr}, {S64, RegionPtr}});
893   // TODO: Pointer types, any 32-bit or 64-bit vector
894 
895   // Condition should be s32 for scalar, s1 for vector.
896   getActionDefinitionsBuilder(G_SELECT)
897     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
898           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
899           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
900     .clampScalar(0, S16, S64)
901     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
902     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
903     .scalarize(1)
904     .clampMaxNumElements(0, S32, 2)
905     .clampMaxNumElements(0, LocalPtr, 2)
906     .clampMaxNumElements(0, PrivatePtr, 2)
907     .scalarize(0)
908     .widenScalarToNextPow2(0)
909     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
910 
911   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
912   // be more flexible with the shift amount type.
913   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
914     .legalFor({{S32, S32}, {S64, S32}});
915   if (ST.has16BitInsts()) {
916     if (ST.hasVOP3PInsts()) {
917       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
918             .clampMaxNumElements(0, S16, 2);
919     } else
920       Shifts.legalFor({{S16, S32}, {S16, S16}});
921 
922     // TODO: Support 16-bit shift amounts
923     Shifts.clampScalar(1, S32, S32);
924     Shifts.clampScalar(0, S16, S64);
925     Shifts.widenScalarToNextPow2(0, 16);
926   } else {
927     // Make sure we legalize the shift amount type first, as the general
928     // expansion for the shifted type will produce much worse code if it hasn't
929     // been truncated already.
930     Shifts.clampScalar(1, S32, S32);
931     Shifts.clampScalar(0, S32, S64);
932     Shifts.widenScalarToNextPow2(0, 32);
933   }
934   Shifts.scalarize(0);
935 
936   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
937     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
938     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
939     unsigned IdxTypeIdx = 2;
940 
941     getActionDefinitionsBuilder(Op)
942       .customIf([=](const LegalityQuery &Query) {
943           const LLT EltTy = Query.Types[EltTypeIdx];
944           const LLT VecTy = Query.Types[VecTypeIdx];
945           const LLT IdxTy = Query.Types[IdxTypeIdx];
946           return (EltTy.getSizeInBits() == 16 ||
947                   EltTy.getSizeInBits() % 32 == 0) &&
948                  VecTy.getSizeInBits() % 32 == 0 &&
949                  VecTy.getSizeInBits() <= 1024 &&
950                  IdxTy.getSizeInBits() == 32;
951         })
952       .clampScalar(EltTypeIdx, S32, S64)
953       .clampScalar(VecTypeIdx, S32, S64)
954       .clampScalar(IdxTypeIdx, S32, S32);
955   }
956 
957   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
958     .unsupportedIf([=](const LegalityQuery &Query) {
959         const LLT &EltTy = Query.Types[1].getElementType();
960         return Query.Types[0] != EltTy;
961       });
962 
963   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
964     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
965     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
966 
967     // FIXME: Doesn't handle extract of illegal sizes.
968     getActionDefinitionsBuilder(Op)
969       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
970       // FIXME: Multiples of 16 should not be legal.
971       .legalIf([=](const LegalityQuery &Query) {
972           const LLT BigTy = Query.Types[BigTyIdx];
973           const LLT LitTy = Query.Types[LitTyIdx];
974           return (BigTy.getSizeInBits() % 32 == 0) &&
975                  (LitTy.getSizeInBits() % 16 == 0);
976         })
977       .widenScalarIf(
978         [=](const LegalityQuery &Query) {
979           const LLT BigTy = Query.Types[BigTyIdx];
980           return (BigTy.getScalarSizeInBits() < 16);
981         },
982         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
983       .widenScalarIf(
984         [=](const LegalityQuery &Query) {
985           const LLT LitTy = Query.Types[LitTyIdx];
986           return (LitTy.getScalarSizeInBits() < 16);
987         },
988         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
989       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
990       .widenScalarToNextPow2(BigTyIdx, 32);
991 
992   }
993 
994   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
995     .legalForCartesianProduct(AllS32Vectors, {S32})
996     .legalForCartesianProduct(AllS64Vectors, {S64})
997     .clampNumElements(0, V16S32, V32S32)
998     .clampNumElements(0, V2S64, V16S64)
999     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1000 
1001   if (ST.hasScalarPackInsts())
1002     BuildVector.legalFor({V2S16, S32});
1003 
1004   BuildVector
1005     .minScalarSameAs(1, 0)
1006     .legalIf(isRegisterType(0))
1007     .minScalarOrElt(0, S32);
1008 
1009   if (ST.hasScalarPackInsts()) {
1010     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1011       .legalFor({V2S16, S32})
1012       .lower();
1013   } else {
1014     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1015       .lower();
1016   }
1017 
1018   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1019     .legalIf(isRegisterType(0));
1020 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
  // pre-legalize.
1023   if (ST.hasVOP3PInsts()) {
1024     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1025       .customFor({V2S16, V2S16})
1026       .lower();
1027   } else
1028     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1029 
1030   // Merge/Unmerge
1031   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1032     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1033     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1034 
1035     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1036       const LLT &Ty = Query.Types[TypeIdx];
1037       if (Ty.isVector()) {
1038         const LLT &EltTy = Ty.getElementType();
1039         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1040           return true;
1041         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1042           return true;
1043       }
1044       return false;
1045     };
1046 
1047     auto &Builder = getActionDefinitionsBuilder(Op)
1048       // Try to widen to s16 first for small types.
1049       // TODO: Only do this on targets with legal s16 shifts
1050       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1051 
1052       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1053       .lowerFor({{S16, V2S16}})
1054       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1055       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1056                            elementTypeIs(1, S16)),
1057                        changeTo(1, V2S16))
      // Clamp the little scalar to s32-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
1061       .clampScalar(LitTyIdx, S32, S256)
1062       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1063       // Break up vectors with weird elements into scalars
1064       .fewerElementsIf(
1065         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1066         scalarize(0))
1067       .fewerElementsIf(
1068         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1069         scalarize(1))
1070       .clampScalar(BigTyIdx, S32, S1024);
1071 
1072     if (Op == G_MERGE_VALUES) {
1073       Builder.widenScalarIf(
1074         // TODO: Use 16-bit shifts if legal for 8-bit values?
1075         [=](const LegalityQuery &Query) {
1076           const LLT Ty = Query.Types[LitTyIdx];
1077           return Ty.getSizeInBits() < 32;
1078         },
1079         changeTo(LitTyIdx, S32));
1080     }
1081 
1082     Builder.widenScalarIf(
1083       [=](const LegalityQuery &Query) {
1084         const LLT Ty = Query.Types[BigTyIdx];
1085         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1086           Ty.getSizeInBits() % 16 != 0;
1087       },
1088       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or the next multiple of 64 once above
        // 128 bits, whichever is smaller.
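        // For example, a 300-bit type widens to 320 rather than all the way
        // to 512.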
1091         const LLT &Ty = Query.Types[BigTyIdx];
1092         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1093         if (NewSizeInBits >= 256) {
1094           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1095           if (RoundedTo < NewSizeInBits)
1096             NewSizeInBits = RoundedTo;
1097         }
1098         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1099       })
1100       .legalIf([=](const LegalityQuery &Query) {
1101           const LLT &BigTy = Query.Types[BigTyIdx];
1102           const LLT &LitTy = Query.Types[LitTyIdx];
1103 
1104           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1105             return false;
1106           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1107             return false;
1108 
1109           return BigTy.getSizeInBits() % 16 == 0 &&
1110                  LitTy.getSizeInBits() % 16 == 0 &&
1111                  BigTy.getSizeInBits() <= 1024;
1112         })
1113       // Any vectors left are the wrong size. Scalarize them.
1114       .scalarize(0)
1115       .scalarize(1);
1116   }
1117 
  // TODO: Make legal for s32, s64. The s64 case needs to be broken down in
  // regbankselect.
1119   getActionDefinitionsBuilder(G_SEXT_INREG)
1120     .clampScalar(0, MinLegalScalarShiftTy, S64)
1121     .lower();
1122 
1123   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1124     .legalFor({S64});
1125 
1126   getActionDefinitionsBuilder({
1127       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1128       G_FCOPYSIGN,
1129 
1130       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1131       G_READ_REGISTER,
1132       G_WRITE_REGISTER,
1133 
1134       G_SADDO, G_SSUBO,
1135 
1136        // TODO: Implement
1137       G_FMINIMUM, G_FMAXIMUM
1138     }).lower();
1139 
1140   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1141         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1142         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1143     .unsupported();
1144 
1145   computeTables();
1146   verify(*ST.getInstrInfo());
1147 }
1148 
1149 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1150                                          MachineRegisterInfo &MRI,
1151                                          MachineIRBuilder &B,
1152                                          GISelChangeObserver &Observer) const {
1153   switch (MI.getOpcode()) {
1154   case TargetOpcode::G_ADDRSPACE_CAST:
1155     return legalizeAddrSpaceCast(MI, MRI, B);
1156   case TargetOpcode::G_FRINT:
1157     return legalizeFrint(MI, MRI, B);
1158   case TargetOpcode::G_FCEIL:
1159     return legalizeFceil(MI, MRI, B);
1160   case TargetOpcode::G_INTRINSIC_TRUNC:
1161     return legalizeIntrinsicTrunc(MI, MRI, B);
1162   case TargetOpcode::G_SITOFP:
1163     return legalizeITOFP(MI, MRI, B, true);
1164   case TargetOpcode::G_UITOFP:
1165     return legalizeITOFP(MI, MRI, B, false);
1166   case TargetOpcode::G_FPTOSI:
1167     return legalizeFPTOI(MI, MRI, B, true);
1168   case TargetOpcode::G_FPTOUI:
1169     return legalizeFPTOI(MI, MRI, B, false);
1170   case TargetOpcode::G_FMINNUM:
1171   case TargetOpcode::G_FMAXNUM:
1172   case TargetOpcode::G_FMINNUM_IEEE:
1173   case TargetOpcode::G_FMAXNUM_IEEE:
1174     return legalizeMinNumMaxNum(MI, MRI, B);
1175   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1176     return legalizeExtractVectorElt(MI, MRI, B);
1177   case TargetOpcode::G_INSERT_VECTOR_ELT:
1178     return legalizeInsertVectorElt(MI, MRI, B);
1179   case TargetOpcode::G_SHUFFLE_VECTOR:
1180     return legalizeShuffleVector(MI, MRI, B);
1181   case TargetOpcode::G_FSIN:
1182   case TargetOpcode::G_FCOS:
1183     return legalizeSinCos(MI, MRI, B);
1184   case TargetOpcode::G_GLOBAL_VALUE:
1185     return legalizeGlobalValue(MI, MRI, B);
1186   case TargetOpcode::G_LOAD:
1187     return legalizeLoad(MI, MRI, B, Observer);
1188   case TargetOpcode::G_FMAD:
1189     return legalizeFMad(MI, MRI, B);
1190   case TargetOpcode::G_FDIV:
1191     return legalizeFDIV(MI, MRI, B);
1192   case TargetOpcode::G_ATOMIC_CMPXCHG:
1193     return legalizeAtomicCmpXChg(MI, MRI, B);
1194   case TargetOpcode::G_FLOG:
1195     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1196   case TargetOpcode::G_FLOG10:
1197     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1198   default:
1199     return false;
1200   }
1201 
1202   llvm_unreachable("expected switch to return");
1203 }
1204 
1205 Register AMDGPULegalizerInfo::getSegmentAperture(
1206   unsigned AS,
1207   MachineRegisterInfo &MRI,
1208   MachineIRBuilder &B) const {
1209   MachineFunction &MF = B.getMF();
1210   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1211   const LLT S32 = LLT::scalar(32);
1212 
1213   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1214 
1215   if (ST.hasApertureRegs()) {
1216     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1217     // getreg.
1218     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1219         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1220         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1221     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1222         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1223         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
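    // Pack the hardware register ID, the bit offset, and the width minus one
    // into the S_GETREG_B32 immediate.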
1224     unsigned Encoding =
1225         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1226         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1227         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1228 
1229     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1230     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1231 
1232     B.buildInstr(AMDGPU::S_GETREG_B32)
1233       .addDef(GetReg)
1234       .addImm(Encoding);
1235     MRI.setType(GetReg, S32);
1236 
1237     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1238     B.buildInstr(TargetOpcode::G_SHL)
1239       .addDef(ApertureReg)
1240       .addUse(GetReg)
1241       .addUse(ShiftAmt.getReg(0));
1242 
1243     return ApertureReg;
1244   }
1245 
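  // Without aperture registers, read the aperture base out of the amd_queue_t
  // structure addressed by the queue pointer input.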
1246   Register QueuePtr = MRI.createGenericVirtualRegister(
1247     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1248 
1249   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1250   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1251     return Register();
1252 
1253   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1254   // private_segment_aperture_base_hi.
1255   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1256 
1257   // TODO: can we be smarter about machine pointer info?
1258   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1259   MachineMemOperand *MMO = MF.getMachineMemOperand(
1260     PtrInfo,
1261     MachineMemOperand::MOLoad |
1262     MachineMemOperand::MODereferenceable |
1263     MachineMemOperand::MOInvariant,
1264     4,
1265     MinAlign(64, StructOffset));
1266 
1267   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1268   Register LoadAddr;
1269 
1270   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1271   B.buildLoad(LoadResult, LoadAddr, *MMO);
1272   return LoadResult;
1273 }
1274 
1275 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1276   MachineInstr &MI, MachineRegisterInfo &MRI,
1277   MachineIRBuilder &B) const {
1278   MachineFunction &MF = B.getMF();
1279 
1280   B.setInstr(MI);
1281 
1282   const LLT S32 = LLT::scalar(32);
1283   Register Dst = MI.getOperand(0).getReg();
1284   Register Src = MI.getOperand(1).getReg();
1285 
1286   LLT DstTy = MRI.getType(Dst);
1287   LLT SrcTy = MRI.getType(Src);
1288   unsigned DestAS = DstTy.getAddressSpace();
1289   unsigned SrcAS = SrcTy.getAddressSpace();
1290 
1291   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1292   // vector element.
1293   assert(!DstTy.isVector());
1294 
1295   const AMDGPUTargetMachine &TM
1296     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1297 
1298   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1299   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1300     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1301     return true;
1302   }
1303 
1304   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1305     // Truncate.
1306     B.buildExtract(Dst, Src, 0);
1307     MI.eraseFromParent();
1308     return true;
1309   }
1310 
1311   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1312     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1313     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1314 
    // FIXME: This is a bit ugly due to merging 2 pointers into another
    // pointer. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
1318     auto HighAddr = B.buildConstant(
1319       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1320     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1321     MI.eraseFromParent();
1322     return true;
1323   }
1324 
1325   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1326     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1327            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1328     unsigned NullVal = TM.getNullPointerValue(DestAS);
1329 
1330     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1331     auto FlatNull = B.buildConstant(SrcTy, 0);
1332 
1333     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1334 
1335     // Extract low 32-bits of the pointer.
1336     B.buildExtract(PtrLo32, Src, 0);
1337 
1338     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1339     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1340     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1341 
1342     MI.eraseFromParent();
1343     return true;
1344   }
1345 
1346   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1347     return false;
1348 
1349   if (!ST.hasFlatAddressSpace())
1350     return false;
1351 
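  // Cast a 32-bit segment pointer to a 64-bit flat pointer: pair the segment
  // offset with the aperture base in the high half, and map the segment null
  // value to the flat null value.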
1352   auto SegmentNull =
1353       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1354   auto FlatNull =
1355       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1356 
1357   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1358   if (!ApertureReg.isValid())
1359     return false;
1360 
1361   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1362   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1363 
1364   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1365 
1366   // Coerce the type of the low half of the result so we can use merge_values.
1367   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1368   B.buildInstr(TargetOpcode::G_PTRTOINT)
1369     .addDef(SrcAsInt)
1370     .addUse(Src);
1371 
1372   // TODO: Should we allow mismatched types but matching sizes in merges to
1373   // avoid the ptrtoint?
1374   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1375   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1376 
1377   MI.eraseFromParent();
1378   return true;
1379 }
1380 
1381 bool AMDGPULegalizerInfo::legalizeFrint(
1382   MachineInstr &MI, MachineRegisterInfo &MRI,
1383   MachineIRBuilder &B) const {
1384   B.setInstr(MI);
1385 
1386   Register Src = MI.getOperand(1).getReg();
1387   LLT Ty = MRI.getType(Src);
1388   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1389 
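  // C1 is 2^52, the smallest f64 with a ulp of 1.0. Adding and then
  // subtracting it (with the sign of the source) rounds to the nearest
  // integer. Sources with magnitude greater than C2 (just below 2^52) are
  // already integral and pass through unchanged.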
1390   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1391   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1392 
1393   auto C1 = B.buildFConstant(Ty, C1Val);
1394   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1395 
1396   // TODO: Should this propagate fast-math-flags?
1397   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1398   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1399 
1400   auto C2 = B.buildFConstant(Ty, C2Val);
1401   auto Fabs = B.buildFAbs(Ty, Src);
1402 
1403   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1404   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1405   return true;
1406 }
1407 
1408 bool AMDGPULegalizerInfo::legalizeFceil(
1409   MachineInstr &MI, MachineRegisterInfo &MRI,
1410   MachineIRBuilder &B) const {
1411   B.setInstr(MI);
1412 
1413   const LLT S1 = LLT::scalar(1);
1414   const LLT S64 = LLT::scalar(64);
1415 
1416   Register Src = MI.getOperand(1).getReg();
1417   assert(MRI.getType(Src) == S64);
1418 
1419   // result = trunc(src)
1420   // if (src > 0.0 && src != result)
1421   //   result += 1.0
1422 
1423   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1424 
1425   const auto Zero = B.buildFConstant(S64, 0.0);
1426   const auto One = B.buildFConstant(S64, 1.0);
1427   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1428   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1429   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1430   auto Add = B.buildSelect(S64, And, One, Zero);
1431 
1432   // TODO: Should this propagate fast-math-flags?
1433   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1434   return true;
1435 }
1436 
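// Extract the 11-bit biased exponent from the high 32 bits of an f64 and
// subtract the bias (1023) to get the unbiased exponent.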
1437 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1438                                               MachineIRBuilder &B) {
1439   const unsigned FractBits = 52;
1440   const unsigned ExpBits = 11;
1441   LLT S32 = LLT::scalar(32);
1442 
1443   auto Const0 = B.buildConstant(S32, FractBits - 32);
1444   auto Const1 = B.buildConstant(S32, ExpBits);
1445 
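  // ubfe extracts the 11-bit biased exponent, which starts at bit 20 of the
  // high dword (bit 52 of the full f64); subtracting the IEEE-754 bias of 1023
  // gives the unbiased exponent.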
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1449 
1450   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1451 }
1452 
1453 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1454   MachineInstr &MI, MachineRegisterInfo &MRI,
1455   MachineIRBuilder &B) const {
1456   B.setInstr(MI);
1457 
1458   const LLT S1 = LLT::scalar(1);
1459   const LLT S32 = LLT::scalar(32);
1460   const LLT S64 = LLT::scalar(64);
1461 
1462   Register Src = MI.getOperand(1).getReg();
1463   assert(MRI.getType(Src) == S64);
1464 
1465   // TODO: Should this use extract since the low half is unused?
1466   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1467   Register Hi = Unmerge.getReg(1);
1468 
1469   // Extract the upper half, since this is where we will find the sign and
1470   // exponent.
1471   auto Exp = extractF64Exponent(Hi, B);
1472 
1473   const unsigned FractBits = 52;
1474 
1475   // Extract the sign bit.
1476   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1477   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1478 
1479   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1480 
1481   const auto Zero32 = B.buildConstant(S32, 0);
1482 
1483   // Extend back to 64-bits.
1484   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1485 
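  // Shifting the 52-bit fraction mask right by the exponent leaves set bits
  // only in the positions that are fractional for this exponent; clearing
  // those bits in Src truncates toward zero. An exponent < 0 means |Src| < 1,
  // so only the sign survives; an exponent > 51 means Src is already integral.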
1486   auto Shr = B.buildAShr(S64, FractMask, Exp);
1487   auto Not = B.buildNot(S64, Shr);
1488   auto Tmp0 = B.buildAnd(S64, Src, Not);
1489   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1490 
1491   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1492   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1493 
1494   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1495   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1496   return true;
1497 }
1498 
1499 bool AMDGPULegalizerInfo::legalizeITOFP(
1500   MachineInstr &MI, MachineRegisterInfo &MRI,
1501   MachineIRBuilder &B, bool Signed) const {
1502   B.setInstr(MI);
1503 
1504   Register Dst = MI.getOperand(0).getReg();
1505   Register Src = MI.getOperand(1).getReg();
1506 
1507   const LLT S64 = LLT::scalar(64);
1508   const LLT S32 = LLT::scalar(32);
1509 
1510   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1511 
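  // Convert the two 32-bit halves separately and recombine:
  //   result = (fp)Hi * 2^32 + (fp)Lo
  // Only the high half carries the sign, so only it uses sitofp in the signed
  // case; ldexp performs the scale by 2^32.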
1512   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1513 
1514   auto CvtHi = Signed ?
1515     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1516     B.buildUITOFP(S64, Unmerge.getReg(1));
1517 
1518   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1519 
1520   auto ThirtyTwo = B.buildConstant(S32, 32);
1521   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1522     .addUse(CvtHi.getReg(0))
1523     .addUse(ThirtyTwo.getReg(0));
1524 
1525   // TODO: Should this propagate fast-math-flags?
1526   B.buildFAdd(Dst, LdExp, CvtLo);
1527   MI.eraseFromParent();
1528   return true;
1529 }
1530 
1531 // TODO: Copied from DAG implementation. Verify logic and document how this
1532 // actually works.
1533 bool AMDGPULegalizerInfo::legalizeFPTOI(
1534   MachineInstr &MI, MachineRegisterInfo &MRI,
1535   MachineIRBuilder &B, bool Signed) const {
1536   B.setInstr(MI);
1537 
1538   Register Dst = MI.getOperand(0).getReg();
1539   Register Src = MI.getOperand(1).getReg();
1540 
1541   const LLT S64 = LLT::scalar(64);
1542   const LLT S32 = LLT::scalar(32);
1543 
1544   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1545 
1546   unsigned Flags = MI.getFlags();
1547 
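  // Split the truncated value into 32-bit halves in the FP domain:
  //   Hi = floor(trunc(Src) * 2^-32)
  //   Lo = fma(Hi, -2^32, trunc(Src))
  // K0 and K1 are the raw f64 bit patterns of 2^-32 and -2^32 respectively.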
1548   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1549   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1550   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1551 
1552   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1553   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1554   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1555 
1556   auto Hi = Signed ?
1557     B.buildFPTOSI(S32, FloorMul) :
1558     B.buildFPTOUI(S32, FloorMul);
1559   auto Lo = B.buildFPTOUI(S32, Fma);
1560 
1561   B.buildMerge(Dst, { Lo.getReg(0), Hi.getReg(0) });
1562   MI.eraseFromParent();
1563 
1564   return true;
1565 }
1566 
1567 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1568   MachineInstr &MI, MachineRegisterInfo &MRI,
1569   MachineIRBuilder &B) const {
1570   MachineFunction &MF = B.getMF();
1571   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1572 
1573   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1574                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1575 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1578   if (!MFI->getMode().IEEE)
1579     return !IsIEEEOp;
1580 
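  // With ieee_mode enabled, the *_IEEE variants already match the hardware
  // behavior; plain G_FMINNUM/G_FMAXNUM fall through to the generic lowering
  // below.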
1581   if (IsIEEEOp)
1582     return true;
1583 
1584   MachineIRBuilder HelperBuilder(MI);
1585   GISelObserverWrapper DummyObserver;
1586   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1587   HelperBuilder.setInstr(MI);
1588   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1589 }
1590 
1591 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1592   MachineInstr &MI, MachineRegisterInfo &MRI,
1593   MachineIRBuilder &B) const {
1594   // TODO: Should move some of this into LegalizerHelper.
1595 
1596   // TODO: Promote dynamic indexing of s16 to s32
1597   // TODO: Dynamic s64 indexing is only legal for SGPR.
1598   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1599   if (!IdxVal) // Dynamic case will be selected to register indexing.
1600     return true;
1601 
1602   Register Dst = MI.getOperand(0).getReg();
1603   Register Vec = MI.getOperand(1).getReg();
1604 
1605   LLT VecTy = MRI.getType(Vec);
1606   LLT EltTy = VecTy.getElementType();
1607   assert(EltTy == MRI.getType(Dst));
1608 
1609   B.setInstr(MI);
1610 
1611   if (IdxVal.getValue() < VecTy.getNumElements())
1612     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1613   else
1614     B.buildUndef(Dst);
1615 
1616   MI.eraseFromParent();
1617   return true;
1618 }
1619 
1620 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1621   MachineInstr &MI, MachineRegisterInfo &MRI,
1622   MachineIRBuilder &B) const {
1623   // TODO: Should move some of this into LegalizerHelper.
1624 
1625   // TODO: Promote dynamic indexing of s16 to s32
1626   // TODO: Dynamic s64 indexing is only legal for SGPR.
1627   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1628   if (!IdxVal) // Dynamic case will be selected to register indexing.
1629     return true;
1630 
1631   Register Dst = MI.getOperand(0).getReg();
1632   Register Vec = MI.getOperand(1).getReg();
1633   Register Ins = MI.getOperand(2).getReg();
1634 
1635   LLT VecTy = MRI.getType(Vec);
1636   LLT EltTy = VecTy.getElementType();
1637   assert(EltTy == MRI.getType(Ins));
1638 
1639   B.setInstr(MI);
1640 
1641   if (IdxVal.getValue() < VecTy.getNumElements())
1642     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1643   else
1644     B.buildUndef(Dst);
1645 
1646   MI.eraseFromParent();
1647   return true;
1648 }
1649 
1650 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1651   assert(Mask.size() == 2);
1652 
1653   // If one half is undef, the other is trivially in the same reg.
1654   if (Mask[0] == -1 || Mask[1] == -1)
1655     return true;
1656   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1657          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1658 }
1659 
1660 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1661   MachineInstr &MI, MachineRegisterInfo &MRI,
1662   MachineIRBuilder &B) const {
1663   const LLT V2S16 = LLT::vector(2, 16);
1664 
1665   Register Dst = MI.getOperand(0).getReg();
1666   Register Src0 = MI.getOperand(1).getReg();
1667   LLT DstTy = MRI.getType(Dst);
1668   LLT SrcTy = MRI.getType(Src0);
1669 
1670   if (SrcTy == V2S16 && DstTy == V2S16 &&
1671       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1672     return true;
1673 
1674   MachineIRBuilder HelperBuilder(MI);
1675   GISelObserverWrapper DummyObserver;
1676   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1677   HelperBuilder.setInstr(MI);
1678   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1679 }
1680 
1681 bool AMDGPULegalizerInfo::legalizeSinCos(
1682   MachineInstr &MI, MachineRegisterInfo &MRI,
1683   MachineIRBuilder &B) const {
1684   B.setInstr(MI);
1685 
1686   Register DstReg = MI.getOperand(0).getReg();
1687   Register SrcReg = MI.getOperand(1).getReg();
1688   LLT Ty = MRI.getType(DstReg);
1689   unsigned Flags = MI.getFlags();
1690 
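  // The amdgcn.sin/cos intrinsics take their input in units of 2*pi (fractions
  // of a full turn), so pre-scale the argument by 1/(2*pi). Subtargets with a
  // reduced trig input range additionally wrap the scaled value into [0, 1)
  // with amdgcn.fract.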
1691   Register TrigVal;
1692   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1693   if (ST.hasTrigReducedRange()) {
1694     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1695     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1696       .addUse(MulVal.getReg(0))
1697       .setMIFlags(Flags).getReg(0);
1698   } else
1699     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1700 
1701   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1702     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1703   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1704     .addUse(TrigVal)
1705     .setMIFlags(Flags);
1706   MI.eraseFromParent();
1707   return true;
1708 }
1709 
1710 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1711   Register DstReg, LLT PtrTy,
1712   MachineIRBuilder &B, const GlobalValue *GV,
1713   unsigned Offset, unsigned GAFlags) const {
1714   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1715   // to the following code sequence:
1716   //
1717   // For constant address space:
1718   //   s_getpc_b64 s[0:1]
1719   //   s_add_u32 s0, s0, $symbol
1720   //   s_addc_u32 s1, s1, 0
1721   //
1722   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1723   //   a fixup or relocation is emitted to replace $symbol with a literal
1724   //   constant, which is a pc-relative offset from the encoding of the $symbol
1725   //   operand to the global variable.
1726   //
1727   // For global address space:
1728   //   s_getpc_b64 s[0:1]
1729   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1730   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1731   //
1732   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1733   //   fixups or relocations are emitted to replace $symbol@*@lo and
1734   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1735   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1736   //   operand to the global variable.
1737   //
1738   // What we want here is an offset from the value returned by s_getpc
1739   // (which is the address of the s_add_u32 instruction) to the global
1740   // variable, but since the encoding of $symbol starts 4 bytes after the start
1741   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1742   // small. This requires us to add 4 to the global variable offset in order to
1743   // compute the correct address.
1744 
1745   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1746 
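  // The PC-relative address is always computed as a 64-bit value; if the
  // destination pointer is only 32 bits wide, compute into a scratch 64-bit
  // register and extract the low half below.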
1747   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1748     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1749 
1750   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1751     .addDef(PCReg);
1752 
1753   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1754   if (GAFlags == SIInstrInfo::MO_NONE)
1755     MIB.addImm(0);
1756   else
1757     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1758 
1759   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1760 
1761   if (PtrTy.getSizeInBits() == 32)
1762     B.buildExtract(DstReg, PCReg, 0);
1763   return true;
}
1765 
1766 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1767   MachineInstr &MI, MachineRegisterInfo &MRI,
1768   MachineIRBuilder &B) const {
1769   Register DstReg = MI.getOperand(0).getReg();
1770   LLT Ty = MRI.getType(DstReg);
1771   unsigned AS = Ty.getAddressSpace();
1772 
1773   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1774   MachineFunction &MF = B.getMF();
1775   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1776   B.setInstr(MI);
1777 
1778   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1779     if (!MFI->isEntryFunction()) {
1780       const Function &Fn = MF.getFunction();
1781       DiagnosticInfoUnsupported BadLDSDecl(
1782         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1783       Fn.getContext().diagnose(BadLDSDecl);
1784     }
1785 
1786     // TODO: We could emit code to handle the initialization somewhere.
1787     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1788       const SITargetLowering *TLI = ST.getTargetLowering();
1789       if (!TLI->shouldUseLDSConstAddress(GV)) {
1790         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1792       }
1793 
1794       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1795       MI.eraseFromParent();
1796       return true;
1797     }
1798 
1799     const Function &Fn = MF.getFunction();
1800     DiagnosticInfoUnsupported BadInit(
1801       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1802     Fn.getContext().diagnose(BadInit);
1803     return true;
1804   }
1805 
1806   const SITargetLowering *TLI = ST.getTargetLowering();
1807 
1808   if (TLI->shouldEmitFixup(GV)) {
1809     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1810     MI.eraseFromParent();
1811     return true;
1812   }
1813 
1814   if (TLI->shouldEmitPCReloc(GV)) {
1815     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1816     MI.eraseFromParent();
1817     return true;
1818   }
1819 
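  // Otherwise load the 64-bit address of the global from the GOT, which is
  // itself addressed PC-relatively with a gotpcrel32 relocation.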
1820   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1821   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1822 
1823   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1824     MachinePointerInfo::getGOT(MF),
1825     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1826     MachineMemOperand::MOInvariant,
1827     8 /*Size*/, 8 /*Align*/);
1828 
1829   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1830 
1831   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1833     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1834     B.buildExtract(DstReg, Load, 0);
1835   } else
1836     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1837 
1838   MI.eraseFromParent();
1839   return true;
1840 }
1841 
1842 bool AMDGPULegalizerInfo::legalizeLoad(
1843   MachineInstr &MI, MachineRegisterInfo &MRI,
1844   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1845   B.setInstr(MI);
1846   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1847   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1848   Observer.changingInstr(MI);
1849   MI.getOperand(1).setReg(Cast.getReg(0));
1850   Observer.changedInstr(MI);
1851   return true;
1852 }
1853 
1854 bool AMDGPULegalizerInfo::legalizeFMad(
1855   MachineInstr &MI, MachineRegisterInfo &MRI,
1856   MachineIRBuilder &B) const {
1857   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1858   assert(Ty.isScalar());
1859 
1860   MachineFunction &MF = B.getMF();
1861   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1862 
1863   // TODO: Always legal with future ftz flag.
1864   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
1865     return true;
1866   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
1867     return true;
1868 
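  // Denormals are enabled for this type, so expand to fmul + fadd via the
  // generic lowering.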
1870   MachineIRBuilder HelperBuilder(MI);
1871   GISelObserverWrapper DummyObserver;
1872   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1873   HelperBuilder.setMBB(*MI.getParent());
1874   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1875 }
1876 
1877 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1878   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1879   Register DstReg = MI.getOperand(0).getReg();
1880   Register PtrReg = MI.getOperand(1).getReg();
1881   Register CmpVal = MI.getOperand(2).getReg();
1882   Register NewVal = MI.getOperand(3).getReg();
1883 
1884   assert(SITargetLowering::isFlatGlobalAddrSpace(
1885            MRI.getType(PtrReg).getAddressSpace()) &&
1886          "this should not have been custom lowered");
1887 
1888   LLT ValTy = MRI.getType(CmpVal);
1889   LLT VecTy = LLT::vector(2, ValTy);
1890 
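  // The target pseudo takes the new value and the compare value packed into a
  // single two-element vector operand.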
1891   B.setInstr(MI);
1892   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1893 
1894   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1895     .addDef(DstReg)
1896     .addUse(PtrReg)
1897     .addUse(PackedVal)
1898     .setMemRefs(MI.memoperands());
1899 
1900   MI.eraseFromParent();
1901   return true;
1902 }
1903 
1904 bool AMDGPULegalizerInfo::legalizeFlog(
1905   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
1906   Register Dst = MI.getOperand(0).getReg();
1907   Register Src = MI.getOperand(1).getReg();
1908   LLT Ty = B.getMRI()->getType(Dst);
1909   unsigned Flags = MI.getFlags();
1910   B.setInstr(MI);
1911 
1912   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
1913   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
1914 
1915   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
1916   MI.eraseFromParent();
1917   return true;
1918 }
1919 
// Return the use branch instruction, or null if the usage is invalid.
1921 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1922                                        MachineRegisterInfo &MRI,
1923                                        MachineInstr *&Br) {
1924   Register CondDef = MI.getOperand(0).getReg();
1925   if (!MRI.hasOneNonDBGUse(CondDef))
1926     return nullptr;
1927 
1928   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1929   if (UseMI.getParent() != MI.getParent() ||
1930       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1931     return nullptr;
1932 
1933   // Make sure the cond br is followed by a G_BR
1934   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1935   if (Next != MI.getParent()->end()) {
1936     if (Next->getOpcode() != AMDGPU::G_BR)
1937       return nullptr;
1938     Br = &*Next;
1939   }
1940 
1941   return &UseMI;
1942 }
1943 
1944 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1945                                                 Register Reg, LLT Ty) const {
1946   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1947   if (LiveIn)
1948     return LiveIn;
1949 
1950   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1951   MRI.addLiveIn(Reg, NewReg);
1952   return NewReg;
1953 }
1954 
1955 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1956                                          const ArgDescriptor *Arg) const {
1957   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1958     return false; // TODO: Handle these
1959 
1960   assert(Arg->getRegister().isPhysical());
1961 
1962   MachineRegisterInfo &MRI = *B.getMRI();
1963 
1964   LLT Ty = MRI.getType(DstReg);
1965   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1966 
1967   if (Arg->isMasked()) {
1968     // TODO: Should we try to emit this once in the entry block?
1969     const LLT S32 = LLT::scalar(32);
1970     const unsigned Mask = Arg->getMask();
1971     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1972 
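    // Isolate the argument's bit-field: shift the packed register down to the
    // field's start and mask off the field's width.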
1973     Register AndMaskSrc = LiveIn;
1974 
1975     if (Shift != 0) {
1976       auto ShiftAmt = B.buildConstant(S32, Shift);
1977       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1978     }
1979 
1980     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1981   } else
1982     B.buildCopy(DstReg, LiveIn);
1983 
  // Insert the argument copy if it doesn't already exist.
1985   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1986   if (!MRI.getVRegDef(LiveIn)) {
1987     // FIXME: Should have scoped insert pt
1988     MachineBasicBlock &OrigInsBB = B.getMBB();
1989     auto OrigInsPt = B.getInsertPt();
1990 
1991     MachineBasicBlock &EntryMBB = B.getMF().front();
1992     EntryMBB.addLiveIn(Arg->getRegister());
1993     B.setInsertPt(EntryMBB, EntryMBB.begin());
1994     B.buildCopy(LiveIn, Arg->getRegister());
1995 
1996     B.setInsertPt(OrigInsBB, OrigInsPt);
1997   }
1998 
1999   return true;
2000 }
2001 
2002 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2003   MachineInstr &MI,
2004   MachineRegisterInfo &MRI,
2005   MachineIRBuilder &B,
2006   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2007   B.setInstr(MI);
2008 
2009   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2010 
2011   const ArgDescriptor *Arg;
2012   const TargetRegisterClass *RC;
2013   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2014   if (!Arg) {
2015     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2016     return false;
2017   }
2018 
2019   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2020     MI.eraseFromParent();
2021     return true;
2022   }
2023 
2024   return false;
2025 }
2026 
2027 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2028                                        MachineRegisterInfo &MRI,
2029                                        MachineIRBuilder &B) const {
2030   B.setInstr(MI);
2031   Register Dst = MI.getOperand(0).getReg();
2032   LLT DstTy = MRI.getType(Dst);
2033   LLT S16 = LLT::scalar(16);
2034   LLT S32 = LLT::scalar(32);
2035   LLT S64 = LLT::scalar(64);
2036 
2037   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2038     return true;
2039 
2040   if (DstTy == S16)
2041     return legalizeFDIV16(MI, MRI, B);
2042   if (DstTy == S32)
2043     return legalizeFDIV32(MI, MRI, B);
2044   if (DstTy == S64)
2045     return legalizeFDIV64(MI, MRI, B);
2046 
2047   return false;
2048 }
2049 
2050 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2051                                                  MachineRegisterInfo &MRI,
2052                                                  MachineIRBuilder &B) const {
2053   Register Res = MI.getOperand(0).getReg();
2054   Register LHS = MI.getOperand(1).getReg();
2055   Register RHS = MI.getOperand(2).getReg();
2056 
2057   uint16_t Flags = MI.getFlags();
2058 
2059   LLT ResTy = MRI.getType(Res);
2060   LLT S32 = LLT::scalar(32);
2061   LLT S64 = LLT::scalar(64);
2062 
2063   const MachineFunction &MF = B.getMF();
2064   bool Unsafe =
2065     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2066 
2067   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2068     return false;
2069 
2070   if (!Unsafe && ResTy == S32 &&
2071       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
2072     return false;
2073 
2074   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2075     // 1 / x -> RCP(x)
2076     if (CLHS->isExactlyValue(1.0)) {
2077       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2078         .addUse(RHS)
2079         .setMIFlags(Flags);
2080 
2081       MI.eraseFromParent();
2082       return true;
2083     }
2084 
2085     // -1 / x -> RCP( FNEG(x) )
2086     if (CLHS->isExactlyValue(-1.0)) {
2087       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2088       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2089         .addUse(FNeg.getReg(0))
2090         .setMIFlags(Flags);
2091 
2092       MI.eraseFromParent();
2093       return true;
2094     }
2095   }
2096 
2097   // x / y -> x * (1.0 / y)
2098   if (Unsafe) {
2099     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2100       .addUse(RHS)
2101       .setMIFlags(Flags);
2102     B.buildFMul(Res, LHS, RCP, Flags);
2103 
2104     MI.eraseFromParent();
2105     return true;
2106   }
2107 
2108   return false;
2109 }
2110 
2111 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2112                                          MachineRegisterInfo &MRI,
2113                                          MachineIRBuilder &B) const {
2114   B.setInstr(MI);
2115   Register Res = MI.getOperand(0).getReg();
2116   Register LHS = MI.getOperand(1).getReg();
2117   Register RHS = MI.getOperand(2).getReg();
2118 
2119   uint16_t Flags = MI.getFlags();
2120 
2121   LLT S16 = LLT::scalar(16);
2122   LLT S32 = LLT::scalar(32);
2123 
2124   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2125   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2126 
2127   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2128     .addUse(RHSExt.getReg(0))
2129     .setMIFlags(Flags);
2130 
2131   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2132   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2133 
2134   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2135     .addUse(RDst.getReg(0))
2136     .addUse(RHS)
2137     .addUse(LHS)
2138     .setMIFlags(Flags);
2139 
2140   MI.eraseFromParent();
2141   return true;
2142 }
2143 
2144 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2145 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2146 static void toggleSPDenormMode(bool Enable,
2147                                MachineIRBuilder &B,
2148                                const GCNSubtarget &ST,
2149                                AMDGPU::SIModeRegisterDefaults Mode) {
2150   // Set SP denorm mode to this value.
2151   unsigned SPDenormMode =
2152     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2153 
2154   if (ST.hasDenormModeInst()) {
2155     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2156     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2157                                    ? FP_DENORM_FLUSH_NONE
2158                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2159 
2160     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2161     B.buildInstr(AMDGPU::S_DENORM_MODE)
2162       .addImm(NewDenormModeValue);
2163 
2164   } else {
2165     // Select FP32 bit field in mode register.
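    // The FP32 denormal controls are a 2-bit field at bit offset 4 of the MODE
    // register; WIDTH_M1 encodes the field width minus one.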
2166     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2167                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2168                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2169 
2170     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2171       .addImm(SPDenormMode)
2172       .addImm(SPDenormModeBitField);
2173   }
2174 }
2175 
2176 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2177                                          MachineRegisterInfo &MRI,
2178                                          MachineIRBuilder &B) const {
2179   B.setInstr(MI);
2180   Register Res = MI.getOperand(0).getReg();
2181   Register LHS = MI.getOperand(1).getReg();
2182   Register RHS = MI.getOperand(2).getReg();
2183   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2184   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2185 
2186   uint16_t Flags = MI.getFlags();
2187 
2188   LLT S32 = LLT::scalar(32);
2189   LLT S1 = LLT::scalar(1);
2190 
2191   auto One = B.buildFConstant(S32, 1.0f);
2192 
2193   auto DenominatorScaled =
2194     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2195       .addUse(RHS)
2196       .addUse(LHS)
2197       .addImm(1)
2198       .setMIFlags(Flags);
2199   auto NumeratorScaled =
2200     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2201       .addUse(LHS)
2202       .addUse(RHS)
2203       .addImm(0)
2204       .setMIFlags(Flags);
2205 
2206   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2207     .addUse(DenominatorScaled.getReg(0))
2208     .setMIFlags(Flags);
2209   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2210 
2211   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2212   // aren't modeled as reading it.
2213   if (!Mode.FP32Denormals)
2214     toggleSPDenormMode(true, B, ST, Mode);
2215 
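  // Refine the approximate reciprocal and the scaled quotient with a chain of
  // fused multiply-adds; div_fmas then applies a final correction using the
  // last remainder and the scale flag from div_scale.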
2216   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2217   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2218   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2219   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2220   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2221   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2222 
2223   if (!Mode.FP32Denormals)
2224     toggleSPDenormMode(false, B, ST, Mode);
2225 
2226   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2227     .addUse(Fma4.getReg(0))
2228     .addUse(Fma1.getReg(0))
2229     .addUse(Fma3.getReg(0))
2230     .addUse(NumeratorScaled.getReg(1))
2231     .setMIFlags(Flags);
2232 
2233   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2234     .addUse(Fmas.getReg(0))
2235     .addUse(RHS)
2236     .addUse(LHS)
2237     .setMIFlags(Flags);
2238 
2239   MI.eraseFromParent();
2240   return true;
2241 }
2242 
2243 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2244                                          MachineRegisterInfo &MRI,
2245                                          MachineIRBuilder &B) const {
2246   B.setInstr(MI);
2247   Register Res = MI.getOperand(0).getReg();
2248   Register LHS = MI.getOperand(1).getReg();
2249   Register RHS = MI.getOperand(2).getReg();
2250 
2251   uint16_t Flags = MI.getFlags();
2252 
2253   LLT S64 = LLT::scalar(64);
2254   LLT S1 = LLT::scalar(1);
2255 
2256   auto One = B.buildFConstant(S64, 1.0);
2257 
2258   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2259     .addUse(LHS)
2260     .addUse(RHS)
2261     .addImm(1)
2262     .setMIFlags(Flags);
2263 
2264   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2265 
2266   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2267     .addUse(DivScale0.getReg(0))
2268     .setMIFlags(Flags);
2269 
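  // As in the f32 path: refine the reciprocal and the quotient with fused
  // multiply-adds, then let div_fmas and div_fixup apply the final correction.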
2270   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2271   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2272   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2273 
2274   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2275     .addUse(LHS)
2276     .addUse(RHS)
2277     .addImm(0)
2278     .setMIFlags(Flags);
2279 
2280   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2282   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2283 
2284   Register Scale;
2285   if (!ST.hasUsableDivScaleConditionOutput()) {
2286     // Workaround a hardware bug on SI where the condition output from div_scale
2287     // is not usable.
2288 
2289     Scale = MRI.createGenericVirtualRegister(S1);
2290 
2291     LLT S32 = LLT::scalar(32);
2292 
2293     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2294     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2295     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2296     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2297 
2298     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2299                               Scale1Unmerge.getReg(1));
2300     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2301                               Scale0Unmerge.getReg(1));
2302     B.buildXor(Scale, CmpNum, CmpDen);
2303   } else {
2304     Scale = DivScale1.getReg(1);
2305   }
2306 
2307   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2308     .addUse(Fma4.getReg(0))
2309     .addUse(Fma3.getReg(0))
2310     .addUse(Mul.getReg(0))
2311     .addUse(Scale)
2312     .setMIFlags(Flags);
2313 
2314   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2315     .addUse(Fmas.getReg(0))
2316     .addUse(RHS)
2317     .addUse(LHS)
2318     .setMIFlags(Flags);
2319 
2320   MI.eraseFromParent();
2321   return true;
2322 }
2323 
2324 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2325                                                  MachineRegisterInfo &MRI,
2326                                                  MachineIRBuilder &B) const {
2327   B.setInstr(MI);
2328   Register Res = MI.getOperand(0).getReg();
2329   Register LHS = MI.getOperand(2).getReg();
2330   Register RHS = MI.getOperand(3).getReg();
2331   uint16_t Flags = MI.getFlags();
2332 
2333   LLT S32 = LLT::scalar(32);
2334   LLT S1 = LLT::scalar(1);
2335 
2336   auto Abs = B.buildFAbs(S32, RHS, Flags);
2337   const APFloat C0Val(1.0f);
2338 
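  // If |RHS| is larger than 2^96 (C0), pre-scale it by 2^-32 (C1) before the
  // reciprocal so rcp does not underflow, and fold the same scale back into
  // the final result; otherwise the scale is 1.0 (C2).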
2339   auto C0 = B.buildConstant(S32, 0x6f800000);
2340   auto C1 = B.buildConstant(S32, 0x2f800000);
2341   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2342 
2343   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2344   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2345 
2346   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2347 
2348   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2349     .addUse(Mul0.getReg(0))
2350     .setMIFlags(Flags);
2351 
2352   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2353 
2354   B.buildFMul(Res, Sel, Mul1, Flags);
2355 
2356   MI.eraseFromParent();
2357   return true;
2358 }
2359 
2360 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2361                                                  MachineRegisterInfo &MRI,
2362                                                  MachineIRBuilder &B) const {
2363   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2364   if (!MFI->isEntryFunction()) {
2365     return legalizePreloadedArgIntrin(MI, MRI, B,
2366                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2367   }
2368 
2369   B.setInstr(MI);
2370 
2371   uint64_t Offset =
2372     ST.getTargetLowering()->getImplicitParameterOffset(
2373       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2374   Register DstReg = MI.getOperand(0).getReg();
2375   LLT DstTy = MRI.getType(DstReg);
2376   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2377 
2378   const ArgDescriptor *Arg;
2379   const TargetRegisterClass *RC;
2380   std::tie(Arg, RC)
2381     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2382   if (!Arg)
2383     return false;
2384 
2385   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2386   if (!loadInputValue(KernargPtrReg, B, Arg))
2387     return false;
2388 
2389   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2390   MI.eraseFromParent();
2391   return true;
2392 }
2393 
2394 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2395                                               MachineRegisterInfo &MRI,
2396                                               MachineIRBuilder &B,
2397                                               unsigned AddrSpace) const {
2398   B.setInstr(MI);
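  // A flat pointer lies in the given segment iff its high 32 bits equal that
  // segment's aperture base.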
2399   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2400   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2401   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2402   MI.eraseFromParent();
2403   return true;
2404 }
2405 
2406 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2407 // offset (the offset that is included in bounds checking and swizzling, to be
2408 // split between the instruction's voffset and immoffset fields) and soffset
2409 // (the offset that is excluded from bounds checking and swizzling, to go in
2410 // the instruction's soffset field).  This function takes the first kind of
2411 // offset and figures out how to split it between voffset and immoffset.
2412 std::tuple<Register, unsigned, unsigned>
2413 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2414                                         Register OrigOffset) const {
2415   const unsigned MaxImm = 4095;
2416   Register BaseReg;
2417   unsigned TotalConstOffset;
2418   MachineInstr *OffsetDef;
2419   const LLT S32 = LLT::scalar(32);
2420 
2421   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2422     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2423 
2424   unsigned ImmOffset = TotalConstOffset;
2425 
2426   // If the immediate value is too big for the immoffset field, put the value
2427   // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands a better chance
2429   // of being CSEd with the copy/add for another similar load/store.
2430   // However, do not do that rounding down to a multiple of 4096 if that is a
2431   // negative number, as it appears to be illegal to have a negative offset
2432   // in the vgpr, even if adding the immediate offset makes it positive.
2433   unsigned Overflow = ImmOffset & ~MaxImm;
2434   ImmOffset -= Overflow;
2435   if ((int32_t)Overflow < 0) {
2436     Overflow += ImmOffset;
2437     ImmOffset = 0;
2438   }
2439 
2440   if (Overflow != 0) {
2441     if (!BaseReg) {
2442       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2443     } else {
2444       auto OverflowVal = B.buildConstant(S32, Overflow);
2445       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2446     }
2447   }
2448 
2449   if (!BaseReg)
2450     BaseReg = B.buildConstant(S32, 0).getReg(0);
2451 
2452   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2453 }
2454 
2455 /// Handle register layout difference for f16 images for some subtargets.
2456 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2457                                              MachineRegisterInfo &MRI,
2458                                              Register Reg) const {
2459   if (!ST.hasUnpackedD16VMem())
2460     return Reg;
2461 
2462   const LLT S16 = LLT::scalar(16);
2463   const LLT S32 = LLT::scalar(32);
2464   LLT StoreVT = MRI.getType(Reg);
2465   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2466 
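  // On unpacked-D16 subtargets every 16-bit element occupies the low half of
  // its own 32-bit register, so split the packed vector and any-extend each
  // element to 32 bits.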
2467   auto Unmerge = B.buildUnmerge(S16, Reg);
2468 
2469   SmallVector<Register, 4> WideRegs;
2470   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2471     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2472 
2473   int NumElts = StoreVT.getNumElements();
2474 
2475   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2476 }
2477 
2478 Register AMDGPULegalizerInfo::fixStoreSourceType(
2479   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2480   MachineRegisterInfo *MRI = B.getMRI();
2481   LLT Ty = MRI->getType(VData);
2482 
2483   const LLT S16 = LLT::scalar(16);
2484 
2485   // Fixup illegal register types for i8 stores.
2486   if (Ty == LLT::scalar(8) || Ty == S16) {
2487     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2488     return AnyExt;
2489   }
2490 
2491   if (Ty.isVector()) {
2492     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2493       if (IsFormat)
2494         return handleD16VData(B, *MRI, VData);
2495     }
2496   }
2497 
2498   return VData;
2499 }
2500 
2501 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2502                                               MachineRegisterInfo &MRI,
2503                                               MachineIRBuilder &B,
2504                                               bool IsTyped,
2505                                               bool IsFormat) const {
2506   B.setInstr(MI);
2507 
2508   Register VData = MI.getOperand(1).getReg();
2509   LLT Ty = MRI.getType(VData);
2510   LLT EltTy = Ty.getScalarType();
2511   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2512   const LLT S32 = LLT::scalar(32);
2513 
2514   VData = fixStoreSourceType(B, VData, IsFormat);
2515   Register RSrc = MI.getOperand(2).getReg();
2516 
2517   MachineMemOperand *MMO = *MI.memoperands_begin();
2518   const int MemSize = MMO->getSize();
2519 
2520   unsigned ImmOffset;
2521   unsigned TotalOffset;
2522 
2523   // The typed intrinsics add an immediate after the registers.
2524   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2525 
2526   // The struct intrinsic variants add one additional operand over raw.
2527   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2528   Register VIndex;
2529   int OpOffset = 0;
2530   if (HasVIndex) {
2531     VIndex = MI.getOperand(3).getReg();
2532     OpOffset = 1;
2533   }
2534 
2535   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2536   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2537 
2538   unsigned Format = 0;
2539   if (IsTyped) {
2540     Format = MI.getOperand(5 + OpOffset).getImm();
2541     ++OpOffset;
2542   }
2543 
2544   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2545 
2546   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2547   if (TotalOffset != 0)
2548     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2549 
2550   unsigned Opc;
2551   if (IsTyped) {
2552     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2553                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2554   } else if (IsFormat) {
2555     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2556                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2557   } else {
2558     switch (MemSize) {
2559     case 1:
2560       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2561       break;
2562     case 2:
2563       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2564       break;
2565     default:
2566       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2567       break;
2568     }
2569   }
2570 
2571   if (!VIndex)
2572     VIndex = B.buildConstant(S32, 0).getReg(0);
2573 
2574   auto MIB = B.buildInstr(Opc)
2575     .addUse(VData)              // vdata
2576     .addUse(RSrc)               // rsrc
2577     .addUse(VIndex)             // vindex
2578     .addUse(VOffset)            // voffset
2579     .addUse(SOffset)            // soffset
2580     .addImm(ImmOffset);         // offset(imm)
2581 
2582   if (IsTyped)
2583     MIB.addImm(Format);
2584 
2585   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2586      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2587      .addMemOperand(MMO);
2588 
2589   MI.eraseFromParent();
2590   return true;
2591 }
2592 
2593 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2594                                              MachineRegisterInfo &MRI,
2595                                              MachineIRBuilder &B,
2596                                              bool IsFormat,
2597                                              bool IsTyped) const {
2598   B.setInstr(MI);
2599 
2600   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2601   MachineMemOperand *MMO = *MI.memoperands_begin();
2602   const int MemSize = MMO->getSize();
2603   const LLT S32 = LLT::scalar(32);
2604 
2605   Register Dst = MI.getOperand(0).getReg();
2606   Register RSrc = MI.getOperand(2).getReg();
2607 
2608   // The typed intrinsics add an immediate after the registers.
2609   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2610 
2611   // The struct intrinsic variants add one additional operand over raw.
2612   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2613   Register VIndex;
2614   int OpOffset = 0;
2615   if (HasVIndex) {
2616     VIndex = MI.getOperand(3).getReg();
2617     OpOffset = 1;
2618   }
2619 
2620   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2621   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2622 
2623   unsigned Format = 0;
2624   if (IsTyped) {
2625     Format = MI.getOperand(5 + OpOffset).getImm();
2626     ++OpOffset;
2627   }
2628 
2629   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2630   unsigned ImmOffset;
2631   unsigned TotalOffset;
2632 
2633   LLT Ty = MRI.getType(Dst);
2634   LLT EltTy = Ty.getScalarType();
2635   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2636   const bool Unpacked = ST.hasUnpackedD16VMem();
2637 
2638   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2639   if (TotalOffset != 0)
2640     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2641 
2642   unsigned Opc;
2643 
2644   if (IsTyped) {
2645     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2646                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2647   } else if (IsFormat) {
2648     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2649                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2650   } else {
2651     switch (MemSize) {
2652     case 1:
2653       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2654       break;
2655     case 2:
2656       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2657       break;
2658     default:
2659       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2660       break;
2661     }
2662   }
2663 
2664   Register LoadDstReg;
2665 
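  // Sub-dword scalar loads and scalar d16 loads return their result in a
  // 32-bit register; packed d16 vectors on unpacked subtargets return a vector
  // of 32-bit elements. In both cases load into a temporary register and
  // narrow the result back down afterwards.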
2666   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2667   LLT UnpackedTy = Ty.changeElementSize(32);
2668 
2669   if (IsExtLoad)
2670     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2671   else if (Unpacked && IsD16 && Ty.isVector())
2672     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2673   else
2674     LoadDstReg = Dst;
2675 
2676   if (!VIndex)
2677     VIndex = B.buildConstant(S32, 0).getReg(0);
2678 
2679   auto MIB = B.buildInstr(Opc)
2680     .addDef(LoadDstReg)         // vdata
2681     .addUse(RSrc)               // rsrc
2682     .addUse(VIndex)             // vindex
2683     .addUse(VOffset)            // voffset
2684     .addUse(SOffset)            // soffset
2685     .addImm(ImmOffset);         // offset(imm)
2686 
2687   if (IsTyped)
2688     MIB.addImm(Format);
2689 
2690   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2691      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2692      .addMemOperand(MMO);
2693 
2694   if (LoadDstReg != Dst) {
2695     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2696 
    // The result was widened for the extending load; truncate it back down to
    // the original type.
2698     if (IsExtLoad)
2699       B.buildTrunc(Dst, LoadDstReg);
2700     else {
2701       // Repack to original 16-bit vector result
2702       // FIXME: G_TRUNC should work, but legalization currently fails
2703       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2704       SmallVector<Register, 4> Repack;
2705       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2706         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2707       B.buildMerge(Dst, Repack);
2708     }
2709   }
2710 
2711   MI.eraseFromParent();
2712   return true;
2713 }
2714 
2715 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2716                                                MachineIRBuilder &B,
2717                                                bool IsInc) const {
2718   B.setInstr(MI);
2719   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2720                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2721   B.buildInstr(Opc)
2722     .addDef(MI.getOperand(0).getReg())
2723     .addUse(MI.getOperand(2).getReg())
2724     .addUse(MI.getOperand(3).getReg())
2725     .cloneMemRefs(MI);
2726   MI.eraseFromParent();
2727   return true;
2728 }
2729 
2730 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2731   switch (IntrID) {
2732   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2733   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2734     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2735   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2736   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2737     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2738   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2739   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2740     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2741   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2742   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2743     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2744   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2745   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2746     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2747   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2748   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2749     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2750   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2751   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2752     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2753   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2754   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2755     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2756   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2757   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2758     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2759   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2760   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2761     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
2762   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2763   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2764     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
2765   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2766   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2767     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
2768   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
2769   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
2770     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
2771   default:
2772     llvm_unreachable("unhandled atomic opcode");
2773   }
2774 }
2775 
2776 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
2777                                                MachineIRBuilder &B,
2778                                                Intrinsic::ID IID) const {
2779   B.setInstr(MI);
2780 
2781   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
2782                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
2783 
2784   Register Dst = MI.getOperand(0).getReg();
2785   Register VData = MI.getOperand(2).getReg();
2786 
2787   Register CmpVal;
2788   int OpOffset = 0;
2789 
2790   if (IsCmpSwap) {
2791     CmpVal = MI.getOperand(3 + OpOffset).getReg();
2792     ++OpOffset;
2793   }
2794 
2795   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2796   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2797 
2798   // The struct intrinsic variants add one additional operand over raw.
2799   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2800   Register VIndex;
2801   if (HasVIndex) {
2802     VIndex = MI.getOperand(4 + OpOffset).getReg();
2803     ++OpOffset;
2804   }
2805 
2806   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2807   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2808   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2809 
2810   MachineMemOperand *MMO = *MI.memoperands_begin();
2811 
2812   unsigned ImmOffset;
2813   unsigned TotalOffset;
2814   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2815   if (TotalOffset != 0)
2816     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
2817 
2818   if (!VIndex)
2819     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
2820 
2821   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
2822     .addDef(Dst)
2823     .addUse(VData); // vdata
2824 
2825   if (IsCmpSwap)
2826     MIB.addReg(CmpVal);
2827 
2828   MIB.addUse(RSrc)               // rsrc
2829      .addUse(VIndex)             // vindex
2830      .addUse(VOffset)            // voffset
2831      .addUse(SOffset)            // soffset
2832      .addImm(ImmOffset)          // offset(imm)
2833      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2834      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2835      .addMemOperand(MMO);
2836 
2837   MI.eraseFromParent();
2838   return true;
2839 }
2840 
2841 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
2842     MachineInstr &MI, MachineIRBuilder &B,
2843     GISelChangeObserver &Observer,
2844     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
2845   // We are only processing the operands of d16 image operations on subtargets
2846   // that use the unpacked register layout.
2847   if (!ST.hasUnpackedD16VMem())
2848     return true;
2849 
2850   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
2851     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
2852 
2853   if (BaseOpcode->Atomic) // No d16 atomics
2854     return true;
2855 
2856   MachineRegisterInfo *MRI = B.getMRI();
2857   const LLT S32 = LLT::scalar(32);
2858   const LLT S16 = LLT::scalar(16);
2859 
2860   if (BaseOpcode->Store) {
2861     Register VData = MI.getOperand(1).getReg();
2862     LLT Ty = MRI->getType(VData);
2863     if (!Ty.isVector() || Ty.getElementType() != S16)
2864       return true;
2865 
2866     B.setInstr(MI);
2867 
2868     Observer.changingInstr(MI);
2869     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
2870     Observer.changedInstr(MI);
2871     return true;
2872   }
2873 
2874   // Must be an image load.
2875   Register DstReg = MI.getOperand(0).getReg();
2876   LLT Ty = MRI->getType(DstReg);
2877   if (!Ty.isVector() || Ty.getElementType() != S16)
2878     return true;
2879 
2880   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2881 
2882   LLT WidenedTy = Ty.changeElementType(S32);
2883   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
2884 
2885   Observer.changingInstr(MI);
2886   MI.getOperand(0).setReg(WideDstReg);
2887   Observer.changedInstr(MI);
2888 
  // FIXME: Just a vector truncate should be sufficient, but legalization is
  // currently broken.
2891   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
2892 
2893   int NumOps = Unmerge->getNumOperands() - 1;
2894   SmallVector<Register, 4> RemergeParts(NumOps);
2895   for (int I = 0; I != NumOps; ++I)
2896     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
2897 
2898   B.buildBuildVector(DstReg, RemergeParts);
2899   return true;
2900 }
2901 
2902 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2903                                             MachineIRBuilder &B,
2904                                             GISelChangeObserver &Observer) const {
2905   MachineRegisterInfo &MRI = *B.getMRI();
2906 
  // Replace the G_BRCOND that uses the intrinsic result with the
  // exec-manipulating branch pseudos (SI_IF/SI_ELSE/SI_LOOP).
2908   auto IntrID = MI.getIntrinsicID();
2909   switch (IntrID) {
2910   case Intrinsic::amdgcn_if:
2911   case Intrinsic::amdgcn_else: {
2912     MachineInstr *Br = nullptr;
2913     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
2914       const SIRegisterInfo *TRI
2915         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2916 
2917       B.setInstr(*BrCond);
2918       Register Def = MI.getOperand(1).getReg();
2919       Register Use = MI.getOperand(3).getReg();
2920 
2921       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2922       if (Br)
2923         BrTarget = Br->getOperand(0).getMBB();
2924 
2925       if (IntrID == Intrinsic::amdgcn_if) {
2926         B.buildInstr(AMDGPU::SI_IF)
2927           .addDef(Def)
2928           .addUse(Use)
2929           .addMBB(BrTarget);
2930       } else {
2931         B.buildInstr(AMDGPU::SI_ELSE)
2932           .addDef(Def)
2933           .addUse(Use)
2934           .addMBB(BrTarget)
2935           .addImm(0);
2936       }
2937 
2938       if (Br)
2939         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
2940 
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

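      // Replace the conditional branch with SI_LOOP, which consumes the loop
      // mask and branches to the same target.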
      // FIXME: Need to adjust branch targets based on unconditional branch.
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
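  // The workitem/workgroup ID and pointer intrinsics below are lowered to
  // reads of the corresponding preloaded function arguments.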
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
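  // Buffer memory intrinsics: the boolean flags tell the helpers whether this
  // is the format and/or typed (tbuffer) variant.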
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
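  // Image intrinsics are recognized via the dim-intrinsic table; anything
  // else is already legal here.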
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}