//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;

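// Types whose every element occupies a whole number of 32-bit registers, up
// to MaxSize bits in total.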
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

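// Vectors with an odd number of sub-32-bit elements (e.g. v3s16), which are
// easier to legalize by padding with one extra element than by splitting.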
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

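// Mutation that grows the vector by a single element, e.g. v3s16 -> v4s16.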
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

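// Mutation that reduces the element count so each resulting piece is at most
// 64 bits wide, e.g. v4s32 -> v2s32 (rounding the element count up).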
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal.  We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE).customFor({LocalPtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          //       defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);

  // FIXME: Handle alignment requirements.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({
        {S32, GlobalPtr, 8, 8},
        {S32, GlobalPtr, 16, 8},
        {S32, LocalPtr, 8, 8},
        {S32, LocalPtr, 16, 8},
        {S32, PrivatePtr, 8, 8},
        {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

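  // The result of an extract must have the same type as the vector's elements.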
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64);

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

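    // Merge/unmerge pieces must have element types that are byte-sized powers
    // of two between 8 and 64 bits; anything else is scalarized first.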
    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .lowerFor({{S16, V2S16}})
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128,
          // whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, MIRBuilder);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

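// Return a register holding the high 32 bits (the "aperture") of the 64-bit
// flat address corresponding to the given segment address space.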
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, MIRBuilder, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
                               Type::getInt8Ty(MF.getFunction().getContext()),
                               AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    MIRBuilder.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = MIRBuilder.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    MIRBuilder.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

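  // Casting flat to local/private: take the low 32 bits of the pointer, and
  // map the flat null value to the segment's null value.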
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

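  // Casting local/private to flat: combine the 32-bit offset with the
  // aperture to form the 64-bit flat address, mapping null to flat null.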
  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

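// Extract the biased exponent field from the high 32 bits of an f64 value and
// subtract the bias, yielding the unbiased exponent.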
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
1111 
1112 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1113   MachineInstr &MI, MachineRegisterInfo &MRI,
1114   MachineIRBuilder &B) const {
1115   B.setInstr(MI);
1116 
1117   const LLT S1 = LLT::scalar(1);
1118   const LLT S32 = LLT::scalar(32);
1119   const LLT S64 = LLT::scalar(64);
1120 
1121   Register Src = MI.getOperand(1).getReg();
1122   assert(MRI.getType(Src) == S64);
1123 
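  // trunc(x) is computed by clearing the fraction bits below the exponent:
  //   exp < 0:  the magnitude is below 1.0, so the result is +/-0.0
  //   exp > 51: x is already an integer (or inf/nan), so return x
  //   otherwise: clear the low (52 - exp) bits of the fraction
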
  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

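  // Convert the two 32-bit halves separately (the low half is always
  // unsigned), then combine them as hi * 2^32 + lo using ldexp.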
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

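  // The hardware sin/cos instructions expect the input pre-scaled by
  // 1/(2*pi). On subtargets where they only accept a reduced input range,
  // wrap the scaled value into [0, 1) with amdgcn.fract first.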
  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    B.setInstr(MI);

    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }
  } else
    return false;

  const Function &Fn = MF.getFunction();
  DiagnosticInfoUnsupported BadInit(
    Fn, "unsupported initializer for address space", MI.getDebugLoc());
  Fn.getContext().diagnose(BadInit);
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
    UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

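// Return the virtual register holding the given live-in physical register,
// creating and registering it if it does not exist yet.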
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFast(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

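  // Scale the denominator into range before taking its reciprocal: when
  // |rhs| > 0x1.0p+96, pre-multiply it by 0x1.0p-32, then multiply the final
  // result by the same scale factor to compensate.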
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the G_BRCOND that uses the intrinsic result with the exec
  // manipulation and branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFast(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  default:
    return true;
  }

  return true;
}