1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Round the number of vector elements up to the next power of two.
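// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.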
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
52 // Round the scalar size in bits up to the next power of two.
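// e.g. s48 -> s64, s96 -> s128.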
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
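// Matches vectors with an odd number of sub-32-bit elements whose total size
// is not a multiple of 32 bits, e.g. <3 x s16> (48 bits) or <5 x s8> (40 bits).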
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
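// Split a vector into pieces of at most 64 bits each,
// e.g. <4 x s32> (128 bits) is reduced to <2 x s32>.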
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements until the total size is a multiple
112 // of 32 bits.
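// e.g. <3 x s8> (24 bits) becomes <4 x s8> (32 bits).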
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Vectors of 32/64/128/256-bit elements, vectors with an even number of
151 // 16-bit elements, and scalars that are a multiple of 32 bits up to 1024 bits.
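// e.g. v2s16, v4s32, s64 and v8s64 are register types; v3s16 and s48 are not.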
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
182   return [=](const LegalityQuery &Query) {
183     return Query.Types[TypeIdx0].getSizeInBits() <
184            Query.Types[TypeIdx1].getSizeInBits();
185   };
186 }
187 
188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
189   return [=](const LegalityQuery &Query) {
190     return Query.Types[TypeIdx0].getSizeInBits() >
191            Query.Types[TypeIdx1].getSizeInBits();
192   };
193 }
194 
195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
196                                          const GCNTargetMachine &TM)
197   :  ST(ST_) {
198   using namespace TargetOpcode;
199 
200   auto GetAddrSpacePtr = [&TM](unsigned AS) {
201     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
202   };
203 
204   const LLT S1 = LLT::scalar(1);
205   const LLT S16 = LLT::scalar(16);
206   const LLT S32 = LLT::scalar(32);
207   const LLT S64 = LLT::scalar(64);
208   const LLT S128 = LLT::scalar(128);
209   const LLT S256 = LLT::scalar(256);
210   const LLT S1024 = LLT::scalar(1024);
211 
212   const LLT V2S16 = LLT::vector(2, 16);
213   const LLT V4S16 = LLT::vector(4, 16);
214 
215   const LLT V2S32 = LLT::vector(2, 32);
216   const LLT V3S32 = LLT::vector(3, 32);
217   const LLT V4S32 = LLT::vector(4, 32);
218   const LLT V5S32 = LLT::vector(5, 32);
219   const LLT V6S32 = LLT::vector(6, 32);
220   const LLT V7S32 = LLT::vector(7, 32);
221   const LLT V8S32 = LLT::vector(8, 32);
222   const LLT V9S32 = LLT::vector(9, 32);
223   const LLT V10S32 = LLT::vector(10, 32);
224   const LLT V11S32 = LLT::vector(11, 32);
225   const LLT V12S32 = LLT::vector(12, 32);
226   const LLT V13S32 = LLT::vector(13, 32);
227   const LLT V14S32 = LLT::vector(14, 32);
228   const LLT V15S32 = LLT::vector(15, 32);
229   const LLT V16S32 = LLT::vector(16, 32);
230   const LLT V32S32 = LLT::vector(32, 32);
231 
232   const LLT V2S64 = LLT::vector(2, 64);
233   const LLT V3S64 = LLT::vector(3, 64);
234   const LLT V4S64 = LLT::vector(4, 64);
235   const LLT V5S64 = LLT::vector(5, 64);
236   const LLT V6S64 = LLT::vector(6, 64);
237   const LLT V7S64 = LLT::vector(7, 64);
238   const LLT V8S64 = LLT::vector(8, 64);
239   const LLT V16S64 = LLT::vector(16, 64);
240 
241   std::initializer_list<LLT> AllS32Vectors =
242     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
243      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
244   std::initializer_list<LLT> AllS64Vectors =
245     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
246 
247   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
248   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
249   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
250   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
251   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
252   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
253   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
254 
255   const LLT CodePtr = FlatPtr;
256 
257   const std::initializer_list<LLT> AddrSpaces64 = {
258     GlobalPtr, ConstantPtr, FlatPtr
259   };
260 
261   const std::initializer_list<LLT> AddrSpaces32 = {
262     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
263   };
264 
265   const std::initializer_list<LLT> FPTypesBase = {
266     S32, S64
267   };
268 
269   const std::initializer_list<LLT> FPTypes16 = {
270     S32, S64, S16
271   };
272 
273   const std::initializer_list<LLT> FPTypesPK16 = {
274     S32, S64, S16, V2S16
275   };
276 
277   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
278 
279   setAction({G_BRCOND, S1}, Legal); // VCC branches
280   setAction({G_BRCOND, S32}, Legal); // SCC branches
281 
282   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
283   // elements for v3s16
284   getActionDefinitionsBuilder(G_PHI)
285     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
286     .legalFor(AllS32Vectors)
287     .legalFor(AllS64Vectors)
288     .legalFor(AddrSpaces64)
289     .legalFor(AddrSpaces32)
290     .clampScalar(0, S32, S256)
291     .widenScalarToNextPow2(0, 32)
292     .clampMaxNumElements(0, S32, 16)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .legalIf(isPointer(0));
295 
296   if (ST.has16BitInsts()) {
297     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
298       .legalFor({S32, S16})
299       .clampScalar(0, S16, S32)
300       .scalarize(0)
301       .widenScalarToNextPow2(0, 32);
302   } else {
303     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
304       .legalFor({S32})
305       .clampScalar(0, S32, S32)
306       .scalarize(0);
307   }
308 
309   // FIXME: Not really legal. Placeholder for custom lowering.
310   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
311     .customFor({S32, S64})
312     .clampScalar(0, S32, S64)
313     .widenScalarToNextPow2(0, 32)
314     .scalarize(0);
315 
316   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
317     .legalFor({S32})
318     .clampScalar(0, S32, S32)
319     .scalarize(0);
320 
321   // Report legal for any types we can handle anywhere. For the cases only legal
322   // on the SALU, RegBankSelect will be able to re-legalize.
323   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
324     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
325     .clampScalar(0, S32, S64)
326     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
327     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
328     .widenScalarToNextPow2(0)
329     .scalarize(0);
330 
331   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
332                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
333     .legalFor({{S32, S1}, {S32, S32}})
334     .clampScalar(0, S32, S32)
335     .scalarize(0); // TODO: Implement.
336 
337   getActionDefinitionsBuilder(G_BITCAST)
338     // Don't worry about the size constraint.
339     .legalIf(all(isRegisterType(0), isRegisterType(1)))
340     .lower();
341 
342 
343   getActionDefinitionsBuilder(G_CONSTANT)
344     .legalFor({S1, S32, S64, S16, GlobalPtr,
345                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
346     .clampScalar(0, S32, S64)
347     .widenScalarToNextPow2(0)
348     .legalIf(isPointer(0));
349 
350   getActionDefinitionsBuilder(G_FCONSTANT)
351     .legalFor({S32, S64, S16})
352     .clampScalar(0, S16, S64);
353 
354   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
355     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
356                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
357     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
358     .clampScalarOrElt(0, S32, S1024)
359     .legalIf(isMultiple32(0))
360     .widenScalarToNextPow2(0, 32)
361     .clampMaxNumElements(0, S32, 16);
362 
363   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
364   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
365     .unsupportedFor({PrivatePtr})
366     .custom();
367   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
368 
369   auto &FPOpActions = getActionDefinitionsBuilder(
370     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
371     .legalFor({S32, S64});
372   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
373     .customFor({S32, S64});
374   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
375     .customFor({S32, S64});
376 
377   if (ST.has16BitInsts()) {
378     if (ST.hasVOP3PInsts())
379       FPOpActions.legalFor({S16, V2S16});
380     else
381       FPOpActions.legalFor({S16});
382 
383     TrigActions.customFor({S16});
384     FDIVActions.customFor({S16});
385   }
386 
387   auto &MinNumMaxNum = getActionDefinitionsBuilder({
388       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
389 
390   if (ST.hasVOP3PInsts()) {
391     MinNumMaxNum.customFor(FPTypesPK16)
392       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
393       .clampMaxNumElements(0, S16, 2)
394       .clampScalar(0, S16, S64)
395       .scalarize(0);
396   } else if (ST.has16BitInsts()) {
397     MinNumMaxNum.customFor(FPTypes16)
398       .clampScalar(0, S16, S64)
399       .scalarize(0);
400   } else {
401     MinNumMaxNum.customFor(FPTypesBase)
402       .clampScalar(0, S32, S64)
403       .scalarize(0);
404   }
405 
406   if (ST.hasVOP3PInsts())
407     FPOpActions.clampMaxNumElements(0, S16, 2);
408 
409   FPOpActions
410     .scalarize(0)
411     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
412 
413   TrigActions
414     .scalarize(0)
415     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
416 
417   FDIVActions
418     .scalarize(0)
419     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
420 
421   getActionDefinitionsBuilder({G_FNEG, G_FABS})
422     .legalFor(FPTypesPK16)
423     .clampMaxNumElements(0, S16, 2)
424     .scalarize(0)
425     .clampScalar(0, S16, S64);
426 
427   if (ST.has16BitInsts()) {
428     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
429       .legalFor({S32, S64, S16})
430       .scalarize(0)
431       .clampScalar(0, S16, S64);
432   } else {
433     getActionDefinitionsBuilder(G_FSQRT)
434       .legalFor({S32, S64})
435       .scalarize(0)
436       .clampScalar(0, S32, S64);
437 
438     if (ST.hasFractBug()) {
439       getActionDefinitionsBuilder(G_FFLOOR)
440         .customFor({S64})
441         .legalFor({S32, S64})
442         .scalarize(0)
443         .clampScalar(0, S32, S64);
444     } else {
445       getActionDefinitionsBuilder(G_FFLOOR)
446         .legalFor({S32, S64})
447         .scalarize(0)
448         .clampScalar(0, S32, S64);
449     }
450   }
451 
452   getActionDefinitionsBuilder(G_FPTRUNC)
453     .legalFor({{S32, S64}, {S16, S32}})
454     .scalarize(0)
455     .lower();
456 
457   getActionDefinitionsBuilder(G_FPEXT)
458     .legalFor({{S64, S32}, {S32, S16}})
459     .lowerFor({{S64, S16}}) // FIXME: Implement
460     .scalarize(0);
461 
462   getActionDefinitionsBuilder(G_FSUB)
463       // Use actual fsub instruction
464       .legalFor({S32})
465       // Must use fadd + fneg
466       .lowerFor({S64, S16, V2S16})
467       .scalarize(0)
468       .clampScalar(0, S32, S64);
469 
470   // Whether this is legal depends on the floating point mode for the function.
471   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
472   if (ST.hasMadF16())
473     FMad.customFor({S32, S16});
474   else
475     FMad.customFor({S32});
476   FMad.scalarize(0)
477       .lower();
478 
479   getActionDefinitionsBuilder(G_TRUNC)
480     .alwaysLegal();
481 
482   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
483     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
484                {S32, S1}, {S64, S1}, {S16, S1}})
485     .scalarize(0)
486     .clampScalar(0, S32, S64)
487     .widenScalarToNextPow2(1, 32);
488 
489   // TODO: Split s1->s64 during regbankselect for VALU.
490   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
491     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
492     .lowerFor({{S32, S64}})
493     .lowerIf(typeIs(1, S1))
494     .customFor({{S64, S64}});
495   if (ST.has16BitInsts())
496     IToFP.legalFor({{S16, S16}});
497   IToFP.clampScalar(1, S32, S64)
498        .scalarize(0)
499        .widenScalarToNextPow2(1);
500 
501   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
502     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
503     .customFor({{S64, S64}});
504   if (ST.has16BitInsts())
505     FPToI.legalFor({{S16, S16}});
506   else
507     FPToI.minScalar(1, S32);
508 
509   FPToI.minScalar(0, S32)
510        .scalarize(0)
511        .lower();
512 
513   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
514     .scalarize(0)
515     .lower();
516 
517   if (ST.has16BitInsts()) {
518     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
519       .legalFor({S16, S32, S64})
520       .clampScalar(0, S16, S64)
521       .scalarize(0);
522   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
523     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
524       .legalFor({S32, S64})
525       .clampScalar(0, S32, S64)
526       .scalarize(0);
527   } else {
528     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
529       .legalFor({S32})
530       .customFor({S64})
531       .clampScalar(0, S32, S64)
532       .scalarize(0);
533   }
534 
535   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
536     .scalarize(0)
537     .alwaysLegal();
538 
539   auto &CmpBuilder =
540     getActionDefinitionsBuilder(G_ICMP)
541     // The compare output type differs based on the register bank of the output,
542     // so make both s1 and s32 legal.
543     //
544     // Scalar compares producing output in scc will be promoted to s32, as that
545     // is the allocatable register type that will be needed for the copy from
546     // scc. This will be promoted during RegBankSelect, and we assume something
547     // before that won't try to use s32 result types.
548     //
549     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
550     // bank.
551     .legalForCartesianProduct(
552       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
553     .legalForCartesianProduct(
554       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
555   if (ST.has16BitInsts()) {
556     CmpBuilder.legalFor({{S1, S16}});
557   }
558 
559   CmpBuilder
560     .widenScalarToNextPow2(1)
561     .clampScalar(1, S32, S64)
562     .scalarize(0)
563     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
564 
565   getActionDefinitionsBuilder(G_FCMP)
566     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
567     .widenScalarToNextPow2(1)
568     .clampScalar(1, S32, S64)
569     .scalarize(0);
570 
571   // FIXME: fpow has a selection pattern that should move to custom lowering.
572   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
573   if (ST.has16BitInsts())
574     Exp2Ops.legalFor({S32, S16});
575   else
576     Exp2Ops.legalFor({S32});
577   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
578   Exp2Ops.scalarize(0);
579 
580   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
581   if (ST.has16BitInsts())
582     ExpOps.customFor({{S32}, {S16}});
583   else
584     ExpOps.customFor({S32});
585   ExpOps.clampScalar(0, MinScalarFPTy, S32)
586         .scalarize(0);
587 
588   // The 64-bit versions produce 32-bit results, but only on the SALU.
589   getActionDefinitionsBuilder(G_CTPOP)
590     .legalFor({{S32, S32}, {S32, S64}})
591     .clampScalar(0, S32, S32)
592     .clampScalar(1, S32, S64)
593     .scalarize(0)
594     .widenScalarToNextPow2(0, 32)
595     .widenScalarToNextPow2(1, 32);
596 
597   // The hardware instructions return a different result on 0 than the generic
598   // instructions expect. The hardware produces -1, but the generic instructions
599   // are defined to produce the bitwidth.
600   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
601     .scalarize(0)
602     .clampScalar(0, S32, S32)
603     .clampScalar(1, S32, S64)
604     .widenScalarToNextPow2(0, 32)
605     .widenScalarToNextPow2(1, 32)
606     .lower();
607 
608   // The 64-bit versions produce 32-bit results, but only on the SALU.
609   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
610     .legalFor({{S32, S32}, {S32, S64}})
611     .clampScalar(0, S32, S32)
612     .clampScalar(1, S32, S64)
613     .scalarize(0)
614     .widenScalarToNextPow2(0, 32)
615     .widenScalarToNextPow2(1, 32);
616 
617   getActionDefinitionsBuilder(G_BITREVERSE)
618     .legalFor({S32})
619     .clampScalar(0, S32, S32)
620     .scalarize(0);
621 
622   if (ST.has16BitInsts()) {
623     getActionDefinitionsBuilder(G_BSWAP)
624       .legalFor({S16, S32, V2S16})
625       .clampMaxNumElements(0, S16, 2)
626       // FIXME: Fixing non-power-of-2 before clamp is workaround for
627       // narrowScalar limitation.
628       .widenScalarToNextPow2(0)
629       .clampScalar(0, S16, S32)
630       .scalarize(0);
631 
632     if (ST.hasVOP3PInsts()) {
633       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
634         .legalFor({S32, S16, V2S16})
635         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
636         .clampMaxNumElements(0, S16, 2)
637         .clampScalar(0, S16, S32)
638         .widenScalarToNextPow2(0)
639         .scalarize(0);
640     } else {
641       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
642         .legalFor({S32, S16})
643         .widenScalarToNextPow2(0)
644         .clampScalar(0, S16, S32)
645         .scalarize(0);
646     }
647   } else {
648     // TODO: Should have same legality without v_perm_b32
649     getActionDefinitionsBuilder(G_BSWAP)
650       .legalFor({S32})
651       .lowerIf(narrowerThan(0, 32))
652       // FIXME: Fixing non-power-of-2 before clamp is workaround for
653       // narrowScalar limitation.
654       .widenScalarToNextPow2(0)
655       .maxScalar(0, S32)
656       .scalarize(0)
657       .lower();
658 
659     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
660       .legalFor({S32})
661       .clampScalar(0, S32, S32)
662       .widenScalarToNextPow2(0)
663       .scalarize(0);
664   }
665 
666   getActionDefinitionsBuilder(G_INTTOPTR)
667     // List the common cases
668     .legalForCartesianProduct(AddrSpaces64, {S64})
669     .legalForCartesianProduct(AddrSpaces32, {S32})
670     .scalarize(0)
671     // Accept any address space as long as the size matches
672     .legalIf(sameSize(0, 1))
673     .widenScalarIf(smallerThan(1, 0),
674       [](const LegalityQuery &Query) {
675         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
676       })
677     .narrowScalarIf(greaterThan(1, 0),
678       [](const LegalityQuery &Query) {
679         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
680       });
681 
682   getActionDefinitionsBuilder(G_PTRTOINT)
683     // List the common cases
684     .legalForCartesianProduct(AddrSpaces64, {S64})
685     .legalForCartesianProduct(AddrSpaces32, {S32})
686     .scalarize(0)
687     // Accept any address space as long as the size matches
688     .legalIf(sameSize(0, 1))
689     .widenScalarIf(smallerThan(0, 1),
690       [](const LegalityQuery &Query) {
691         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
692       })
693     .narrowScalarIf(
694       greaterThan(0, 1),
695       [](const LegalityQuery &Query) {
696         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
697       });
698 
699   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
700     .scalarize(0)
701     .custom();
702 
703   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
704   // handle some operations by just promoting the register during
705   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
706   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
707     switch (AS) {
708     // FIXME: Private element size.
709     case AMDGPUAS::PRIVATE_ADDRESS:
710       return 32;
711     // FIXME: Check subtarget
712     case AMDGPUAS::LOCAL_ADDRESS:
713       return ST.useDS128() ? 128 : 64;
714 
715     // Treat constant and global as identical. SMRD loads are sometimes usable
716     // for global loads (ideally constant address space should be eliminated)
717     // depending on the context. Legality cannot be context dependent, but
718     // RegBankSelect can split the load as necessary depending on the pointer
719     // register bank/uniformity and if the memory is invariant or not written in
720     // a kernel.
721     case AMDGPUAS::CONSTANT_ADDRESS:
722     case AMDGPUAS::GLOBAL_ADDRESS:
723       return IsLoad ? 512 : 128;
724     default:
725       return 128;
726     }
727   };
728 
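  // Return true if a load or store must be split, either because the memory
  // size exceeds the limit for its address space, the register count is an
  // unsupported shape, or the access is insufficiently aligned.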
729   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
730                                     bool IsLoad) -> bool {
731     const LLT DstTy = Query.Types[0];
732 
733     // Split vector extloads.
734     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
735     unsigned Align = Query.MMODescrs[0].AlignInBits;
736 
737     if (MemSize < DstTy.getSizeInBits())
738       MemSize = std::max(MemSize, Align);
739 
740     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
741       return true;
742 
743     const LLT PtrTy = Query.Types[1];
744     unsigned AS = PtrTy.getAddressSpace();
745     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
746       return true;
747 
748     // Catch weird-sized loads and stores that don't evenly divide into the access sizes
749     // TODO: May be able to widen depending on alignment etc.
750     unsigned NumRegs = (MemSize + 31) / 32;
751     if (NumRegs == 3) {
752       if (!ST.hasDwordx3LoadStores())
753         return true;
754     } else {
755       // If the alignment allows, these should have been widened.
756       if (!isPowerOf2_32(NumRegs))
757         return true;
758     }
759 
760     if (Align < MemSize) {
761       const SITargetLowering *TLI = ST.getTargetLowering();
762       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
763     }
764 
765     return false;
766   };
767 
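  // Widen a non-power-of-2 load result if rounding up to the next power of 2
  // still fits the address space limit and the access is aligned enough to
  // cover the wider load.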
768   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
769     unsigned Size = Query.Types[0].getSizeInBits();
770     if (isPowerOf2_32(Size))
771       return false;
772 
773     if (Size == 96 && ST.hasDwordx3LoadStores())
774       return false;
775 
776     unsigned AddrSpace = Query.Types[1].getAddressSpace();
777     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
778       return false;
779 
780     unsigned Align = Query.MMODescrs[0].AlignInBits;
781     unsigned RoundedSize = NextPowerOf2(Size);
782     return (Align >= RoundedSize);
783   };
784 
785   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
786   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
787   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
788 
789   // TODO: Refine based on subtargets which support unaligned access or 128-bit
790   // LDS
791   // TODO: Unsupported flat for SI.
792 
793   for (unsigned Op : {G_LOAD, G_STORE}) {
794     const bool IsStore = Op == G_STORE;
795 
796     auto &Actions = getActionDefinitionsBuilder(Op);
797     // Whitelist the common cases.
798     // TODO: Loads to s16 on gfx9
799     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
800                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
801                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
802                                       {S128, GlobalPtr, 128, GlobalAlign32},
803                                       {S64, GlobalPtr, 64, GlobalAlign32},
804                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
805                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
806                                       {S32, GlobalPtr, 8, GlobalAlign8},
807                                       {S32, GlobalPtr, 16, GlobalAlign16},
808 
809                                       {S32, LocalPtr, 32, 32},
810                                       {S64, LocalPtr, 64, 32},
811                                       {V2S32, LocalPtr, 64, 32},
812                                       {S32, LocalPtr, 8, 8},
813                                       {S32, LocalPtr, 16, 16},
814                                       {V2S16, LocalPtr, 32, 32},
815 
816                                       {S32, PrivatePtr, 32, 32},
817                                       {S32, PrivatePtr, 8, 8},
818                                       {S32, PrivatePtr, 16, 16},
819                                       {V2S16, PrivatePtr, 32, 32},
820 
821                                       {S32, FlatPtr, 32, GlobalAlign32},
822                                       {S32, FlatPtr, 16, GlobalAlign16},
823                                       {S32, FlatPtr, 8, GlobalAlign8},
824                                       {V2S16, FlatPtr, 32, GlobalAlign32},
825 
826                                       {S32, ConstantPtr, 32, GlobalAlign32},
827                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
828                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
829                                       {S64, ConstantPtr, 64, GlobalAlign32},
830                                       {S128, ConstantPtr, 128, GlobalAlign32},
831                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
832     Actions
833         .customIf(typeIs(1, Constant32Ptr))
834         // Widen suitably aligned loads by loading extra elements.
835         .moreElementsIf([=](const LegalityQuery &Query) {
836             const LLT Ty = Query.Types[0];
837             return Op == G_LOAD && Ty.isVector() &&
838                    shouldWidenLoadResult(Query);
839           }, moreElementsToNextPow2(0))
840         .widenScalarIf([=](const LegalityQuery &Query) {
841             const LLT Ty = Query.Types[0];
842             return Op == G_LOAD && !Ty.isVector() &&
843                    shouldWidenLoadResult(Query);
844           }, widenScalarOrEltToNextPow2(0))
845         .narrowScalarIf(
846             [=](const LegalityQuery &Query) -> bool {
847               return !Query.Types[0].isVector() &&
848                      needToSplitMemOp(Query, Op == G_LOAD);
849             },
850             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
851               const LLT DstTy = Query.Types[0];
852               const LLT PtrTy = Query.Types[1];
853 
854               const unsigned DstSize = DstTy.getSizeInBits();
855               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
856 
857               // Split extloads.
858               if (DstSize > MemSize)
859                 return std::make_pair(0, LLT::scalar(MemSize));
860 
861               if (!isPowerOf2_32(DstSize)) {
862                 // We're probably decomposing an odd sized store. Try to split
863                 // to the widest type. TODO: Account for alignment. As-is it
864                 // should be OK, since the new parts will be further legalized.
865                 unsigned FloorSize = PowerOf2Floor(DstSize);
866                 return std::make_pair(0, LLT::scalar(FloorSize));
867               }
868 
869               if (DstSize > 32 && (DstSize % 32 != 0)) {
870                 // FIXME: Need a way to specify non-extload of larger size if
871                 // suitably aligned.
872                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
873               }
874 
875               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
876                                                      Op == G_LOAD);
877               if (MemSize > MaxSize)
878                 return std::make_pair(0, LLT::scalar(MaxSize));
879 
880               unsigned Align = Query.MMODescrs[0].AlignInBits;
881               return std::make_pair(0, LLT::scalar(Align));
882             })
883         .fewerElementsIf(
884             [=](const LegalityQuery &Query) -> bool {
885               return Query.Types[0].isVector() &&
886                      needToSplitMemOp(Query, Op == G_LOAD);
887             },
888             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
889               const LLT DstTy = Query.Types[0];
890               const LLT PtrTy = Query.Types[1];
891 
892               LLT EltTy = DstTy.getElementType();
893               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
894                                                      Op == G_LOAD);
895 
896               // FIXME: Handle widened to power of 2 results better. This ends
897               // up scalarizing.
898               // FIXME: 3 element stores scalarized on SI
899 
900               // Split if it's too large for the address space.
901               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
902                 unsigned NumElts = DstTy.getNumElements();
903                 unsigned EltSize = EltTy.getSizeInBits();
904 
905                 if (MaxSize % EltSize == 0) {
906                   return std::make_pair(
907                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
908                 }
909 
910                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
911 
912                 // FIXME: Refine when odd breakdowns handled
913                 // The scalars will need to be re-legalized.
914                 if (NumPieces == 1 || NumPieces >= NumElts ||
915                     NumElts % NumPieces != 0)
916                   return std::make_pair(0, EltTy);
917 
918                 return std::make_pair(0,
919                                       LLT::vector(NumElts / NumPieces, EltTy));
920               }
921 
922               // FIXME: We could probably handle weird extending loads better.
923               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
924               if (DstTy.getSizeInBits() > MemSize)
925                 return std::make_pair(0, EltTy);
926 
927               unsigned EltSize = EltTy.getSizeInBits();
928               unsigned DstSize = DstTy.getSizeInBits();
929               if (!isPowerOf2_32(DstSize)) {
930                 // We're probably decomposing an odd sized store. Try to split
931                 // to the widest type. TODO: Account for alignment. As-is it
932                 // should be OK, since the new parts will be further legalized.
933                 unsigned FloorSize = PowerOf2Floor(DstSize);
934                 return std::make_pair(
935                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
936               }
937 
938               // Need to split because of alignment.
939               unsigned Align = Query.MMODescrs[0].AlignInBits;
940               if (EltSize > Align &&
941                   (EltSize / Align < DstTy.getNumElements())) {
942                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
943               }
944 
945               // May need relegalization for the scalars.
946               return std::make_pair(0, EltTy);
947             })
948         .minScalar(0, S32);
949 
950     if (IsStore)
951       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
952 
953     // TODO: Need a bitcast lower option?
954     Actions
955         .legalIf([=](const LegalityQuery &Query) {
956           const LLT Ty0 = Query.Types[0];
957           unsigned Size = Ty0.getSizeInBits();
958           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
959           unsigned Align = Query.MMODescrs[0].AlignInBits;
960 
961           // FIXME: Widening store from alignment not valid.
962           if (MemSize < Size)
963             MemSize = std::max(MemSize, Align);
964 
965           // No extending vector loads.
966           if (Size > MemSize && Ty0.isVector())
967             return false;
968 
969           switch (MemSize) {
970           case 8:
971           case 16:
972             return Size == 32;
973           case 32:
974           case 64:
975           case 128:
976             return true;
977           case 96:
978             return ST.hasDwordx3LoadStores();
979           case 256:
980           case 512:
981             return true;
982           default:
983             return false;
984           }
985         })
986         .widenScalarToNextPow2(0)
987         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
988   }
989 
990   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
991                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
992                                                   {S32, GlobalPtr, 16, 2 * 8},
993                                                   {S32, LocalPtr, 8, 8},
994                                                   {S32, LocalPtr, 16, 16},
995                                                   {S32, PrivatePtr, 8, 8},
996                                                   {S32, PrivatePtr, 16, 16},
997                                                   {S32, ConstantPtr, 8, 8},
998                                                   {S32, ConstantPtr, 16, 2 * 8}});
999   if (ST.hasFlatAddressSpace()) {
1000     ExtLoads.legalForTypesWithMemDesc(
1001         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1002   }
1003 
1004   ExtLoads.clampScalar(0, S32, S32)
1005           .widenScalarToNextPow2(0)
1006           .unsupportedIfMemSizeNotPow2()
1007           .lower();
1008 
1009   auto &Atomics = getActionDefinitionsBuilder(
1010     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1011      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1012      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1013      G_ATOMICRMW_UMIN})
1014     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1015                {S64, GlobalPtr}, {S64, LocalPtr}});
1016   if (ST.hasFlatAddressSpace()) {
1017     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1018   }
1019 
1020   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1021     .legalFor({{S32, LocalPtr}});
1022 
1023   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
1024   // output demarshalling
1025   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1026     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1027                 {S32, FlatPtr}, {S64, FlatPtr}})
1028     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1029                {S32, RegionPtr}, {S64, RegionPtr}});
1030   // TODO: Pointer types, any 32-bit or 64-bit vector
1031 
1032   // Condition should be s32 for scalar, s1 for vector.
1033   getActionDefinitionsBuilder(G_SELECT)
1034     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1035           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1036           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1037     .clampScalar(0, S16, S64)
1038     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1039     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1040     .scalarize(1)
1041     .clampMaxNumElements(0, S32, 2)
1042     .clampMaxNumElements(0, LocalPtr, 2)
1043     .clampMaxNumElements(0, PrivatePtr, 2)
1044     .scalarize(0)
1045     .widenScalarToNextPow2(0)
1046     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1047 
1048   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1049   // be more flexible with the shift amount type.
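  // (The hardware reads 4 bits of the amount for 16-bit shifts, 5 for 32-bit,
  // and 6 for 64-bit shifts.)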
1050   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1051     .legalFor({{S32, S32}, {S64, S32}});
1052   if (ST.has16BitInsts()) {
1053     if (ST.hasVOP3PInsts()) {
1054       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1055             .clampMaxNumElements(0, S16, 2);
1056     } else
1057       Shifts.legalFor({{S16, S32}, {S16, S16}});
1058 
1059     // TODO: Support 16-bit shift amounts
1060     Shifts.clampScalar(1, S32, S32);
1061     Shifts.clampScalar(0, S16, S64);
1062     Shifts.widenScalarToNextPow2(0, 16);
1063   } else {
1064     // Make sure we legalize the shift amount type first, as the general
1065     // expansion for the shifted type will produce much worse code if it hasn't
1066     // been truncated already.
1067     Shifts.clampScalar(1, S32, S32);
1068     Shifts.clampScalar(0, S32, S64);
1069     Shifts.widenScalarToNextPow2(0, 32);
1070   }
1071   Shifts.scalarize(0);
1072 
1073   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1074     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1075     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1076     unsigned IdxTypeIdx = 2;
1077 
1078     getActionDefinitionsBuilder(Op)
1079       .customIf([=](const LegalityQuery &Query) {
1080           const LLT EltTy = Query.Types[EltTypeIdx];
1081           const LLT VecTy = Query.Types[VecTypeIdx];
1082           const LLT IdxTy = Query.Types[IdxTypeIdx];
1083           return (EltTy.getSizeInBits() == 16 ||
1084                   EltTy.getSizeInBits() % 32 == 0) &&
1085                  VecTy.getSizeInBits() % 32 == 0 &&
1086                  VecTy.getSizeInBits() <= 1024 &&
1087                  IdxTy.getSizeInBits() == 32;
1088         })
1089       .clampScalar(EltTypeIdx, S32, S64)
1090       .clampScalar(VecTypeIdx, S32, S64)
1091       .clampScalar(IdxTypeIdx, S32, S32);
1092   }
1093 
1094   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1095     .unsupportedIf([=](const LegalityQuery &Query) {
1096         const LLT &EltTy = Query.Types[1].getElementType();
1097         return Query.Types[0] != EltTy;
1098       });
1099 
1100   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1101     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1102     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1103 
1104     // FIXME: Doesn't handle extract of illegal sizes.
1105     getActionDefinitionsBuilder(Op)
1106       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1107       // FIXME: Multiples of 16 should not be legal.
1108       .legalIf([=](const LegalityQuery &Query) {
1109           const LLT BigTy = Query.Types[BigTyIdx];
1110           const LLT LitTy = Query.Types[LitTyIdx];
1111           return (BigTy.getSizeInBits() % 32 == 0) &&
1112                  (LitTy.getSizeInBits() % 16 == 0);
1113         })
1114       .widenScalarIf(
1115         [=](const LegalityQuery &Query) {
1116           const LLT BigTy = Query.Types[BigTyIdx];
1117           return (BigTy.getScalarSizeInBits() < 16);
1118         },
1119         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1120       .widenScalarIf(
1121         [=](const LegalityQuery &Query) {
1122           const LLT LitTy = Query.Types[LitTyIdx];
1123           return (LitTy.getScalarSizeInBits() < 16);
1124         },
1125         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1126       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1127       .widenScalarToNextPow2(BigTyIdx, 32);
1128 
1129   }
1130 
1131   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1132     .legalForCartesianProduct(AllS32Vectors, {S32})
1133     .legalForCartesianProduct(AllS64Vectors, {S64})
1134     .clampNumElements(0, V16S32, V32S32)
1135     .clampNumElements(0, V2S64, V16S64)
1136     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1137 
1138   if (ST.hasScalarPackInsts()) {
1139     BuildVector
1140       // FIXME: Should probably widen s1 vectors straight to s32
1141       .minScalarOrElt(0, S16)
1142       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1143       .minScalar(1, S32);
1144 
1145     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1146       .legalFor({V2S16, S32})
1147       .lower();
1148     BuildVector.minScalarOrElt(0, S32);
1149   } else {
1150     BuildVector.customFor({V2S16, S16});
1151     BuildVector.minScalarOrElt(0, S32);
1152 
1153     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1154       .customFor({V2S16, S32})
1155       .lower();
1156   }
1157 
1158   BuildVector.legalIf(isRegisterType(0));
1159 
1160   // FIXME: Clamp maximum size
1161   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1162     .legalIf(isRegisterType(0));
1163 
1164   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1165   // pre-legalize.
1166   if (ST.hasVOP3PInsts()) {
1167     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1168       .customFor({V2S16, V2S16})
1169       .lower();
1170   } else
1171     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1172 
1173   // Merge/Unmerge
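  // For G_MERGE_VALUES the wide type is the result (operand 0) and the pieces
  // are the sources; for G_UNMERGE_VALUES it is the other way around.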
1174   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1175     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1176     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1177 
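    // Reject vector element types that are not a power of 2 between 8 and 64
    // bits; those get scalarized below.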
1178     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1179       const LLT &Ty = Query.Types[TypeIdx];
1180       if (Ty.isVector()) {
1181         const LLT &EltTy = Ty.getElementType();
1182         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1183           return true;
1184         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1185           return true;
1186       }
1187       return false;
1188     };
1189 
1190     auto &Builder = getActionDefinitionsBuilder(Op)
1191       // Try to widen to s16 first for small types.
1192       // TODO: Only do this on targets with legal s16 shifts
1193       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1194 
1195       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1196       .lowerFor({{S16, V2S16}})
1197       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1198       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1199                            elementTypeIs(1, S16)),
1200                        changeTo(1, V2S16))
1201       // Clamp the little scalar to s32-s256 and make it a power of 2. It's not
1202       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1203       // valid.
1204       .clampScalar(LitTyIdx, S32, S256)
1205       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1206       // Break up vectors with weird elements into scalars
1207       .fewerElementsIf(
1208         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1209         scalarize(0))
1210       .fewerElementsIf(
1211         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1212         scalarize(1))
1213       .clampScalar(BigTyIdx, S32, S1024);
1214 
1215     if (Op == G_MERGE_VALUES) {
1216       Builder.widenScalarIf(
1217         // TODO: Use 16-bit shifts if legal for 8-bit values?
1218         [=](const LegalityQuery &Query) {
1219           const LLT Ty = Query.Types[LitTyIdx];
1220           return Ty.getSizeInBits() < 32;
1221         },
1222         changeTo(LitTyIdx, S32));
1223     }
1224 
1225     Builder.widenScalarIf(
1226       [=](const LegalityQuery &Query) {
1227         const LLT Ty = Query.Types[BigTyIdx];
1228         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1229           Ty.getSizeInBits() % 16 != 0;
1230       },
1231       [=](const LegalityQuery &Query) {
1232         // Pick the next power of 2, or a multiple of 64 over 128,
1233         // whichever is smaller.
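        // e.g. s65 is widened to s128, but s257 is widened to s320 rather
        // than s512.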
1234         const LLT &Ty = Query.Types[BigTyIdx];
1235         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1236         if (NewSizeInBits >= 256) {
1237           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1238           if (RoundedTo < NewSizeInBits)
1239             NewSizeInBits = RoundedTo;
1240         }
1241         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1242       })
1243       .legalIf([=](const LegalityQuery &Query) {
1244           const LLT &BigTy = Query.Types[BigTyIdx];
1245           const LLT &LitTy = Query.Types[LitTyIdx];
1246 
1247           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1248             return false;
1249           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1250             return false;
1251 
1252           return BigTy.getSizeInBits() % 16 == 0 &&
1253                  LitTy.getSizeInBits() % 16 == 0 &&
1254                  BigTy.getSizeInBits() <= 1024;
1255         })
1256       // Any vectors left are the wrong size. Scalarize them.
1257       .scalarize(0)
1258       .scalarize(1);
1259   }
1260 
1261   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1262   // RegBankSelect.
1263   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1264     .legalFor({{S32}, {S64}});
1265 
1266   if (ST.hasVOP3PInsts()) {
1267     SextInReg.lowerFor({{V2S16}})
1268       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1269       // get more vector shift opportunities, since we'll get those when
1270       // expanded.
1271       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1272   } else if (ST.has16BitInsts()) {
1273     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1274   } else {
1275     // Prefer to promote to s32 before lowering if we don't have 16-bit
1276     // shifts. This avoids a lot of intermediate truncate and extend operations.
1277     SextInReg.lowerFor({{S32}, {S64}});
1278   }
1279 
1280   SextInReg
1281     .scalarize(0)
1282     .clampScalar(0, S32, S64)
1283     .lower();
1284 
1285   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1286     .legalFor({S64});
1287 
1288   getActionDefinitionsBuilder({
1289       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1290       G_FCOPYSIGN,
1291 
1292       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1293       G_READ_REGISTER,
1294       G_WRITE_REGISTER,
1295 
1296       G_SADDO, G_SSUBO,
1297 
1298        // TODO: Implement
1299       G_FMINIMUM, G_FMAXIMUM
1300     }).lower();
1301 
1302   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1303         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1304         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1305     .unsupported();
1306 
1307   computeTables();
1308   verify(*ST.getInstrInfo());
1309 }
1310 
1311 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1312                                          MachineRegisterInfo &MRI,
1313                                          MachineIRBuilder &B,
1314                                          GISelChangeObserver &Observer) const {
1315   switch (MI.getOpcode()) {
1316   case TargetOpcode::G_ADDRSPACE_CAST:
1317     return legalizeAddrSpaceCast(MI, MRI, B);
1318   case TargetOpcode::G_FRINT:
1319     return legalizeFrint(MI, MRI, B);
1320   case TargetOpcode::G_FCEIL:
1321     return legalizeFceil(MI, MRI, B);
1322   case TargetOpcode::G_INTRINSIC_TRUNC:
1323     return legalizeIntrinsicTrunc(MI, MRI, B);
1324   case TargetOpcode::G_SITOFP:
1325     return legalizeITOFP(MI, MRI, B, true);
1326   case TargetOpcode::G_UITOFP:
1327     return legalizeITOFP(MI, MRI, B, false);
1328   case TargetOpcode::G_FPTOSI:
1329     return legalizeFPTOI(MI, MRI, B, true);
1330   case TargetOpcode::G_FPTOUI:
1331     return legalizeFPTOI(MI, MRI, B, false);
1332   case TargetOpcode::G_FMINNUM:
1333   case TargetOpcode::G_FMAXNUM:
1334   case TargetOpcode::G_FMINNUM_IEEE:
1335   case TargetOpcode::G_FMAXNUM_IEEE:
1336     return legalizeMinNumMaxNum(MI, MRI, B);
1337   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1338     return legalizeExtractVectorElt(MI, MRI, B);
1339   case TargetOpcode::G_INSERT_VECTOR_ELT:
1340     return legalizeInsertVectorElt(MI, MRI, B);
1341   case TargetOpcode::G_SHUFFLE_VECTOR:
1342     return legalizeShuffleVector(MI, MRI, B);
1343   case TargetOpcode::G_FSIN:
1344   case TargetOpcode::G_FCOS:
1345     return legalizeSinCos(MI, MRI, B);
1346   case TargetOpcode::G_GLOBAL_VALUE:
1347     return legalizeGlobalValue(MI, MRI, B);
1348   case TargetOpcode::G_LOAD:
1349     return legalizeLoad(MI, MRI, B, Observer);
1350   case TargetOpcode::G_FMAD:
1351     return legalizeFMad(MI, MRI, B);
1352   case TargetOpcode::G_FDIV:
1353     return legalizeFDIV(MI, MRI, B);
1354   case TargetOpcode::G_UDIV:
1355   case TargetOpcode::G_UREM:
1356     return legalizeUDIV_UREM(MI, MRI, B);
1357   case TargetOpcode::G_SDIV:
1358   case TargetOpcode::G_SREM:
1359     return legalizeSDIV_SREM(MI, MRI, B);
1360   case TargetOpcode::G_ATOMIC_CMPXCHG:
1361     return legalizeAtomicCmpXChg(MI, MRI, B);
1362   case TargetOpcode::G_FLOG:
1363     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1364   case TargetOpcode::G_FLOG10:
1365     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1366   case TargetOpcode::G_FEXP:
1367     return legalizeFExp(MI, B);
1368   case TargetOpcode::G_FPOW:
1369     return legalizeFPow(MI, B);
1370   case TargetOpcode::G_FFLOOR:
1371     return legalizeFFloor(MI, MRI, B);
1372   case TargetOpcode::G_BUILD_VECTOR:
1373     return legalizeBuildVector(MI, MRI, B);
1374   default:
1375     return false;
1376   }
1377 
1378   llvm_unreachable("expected switch to return");
1379 }
1380 
1381 Register AMDGPULegalizerInfo::getSegmentAperture(
1382   unsigned AS,
1383   MachineRegisterInfo &MRI,
1384   MachineIRBuilder &B) const {
1385   MachineFunction &MF = B.getMF();
1386   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1387   const LLT S32 = LLT::scalar(32);
1388 
1389   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1390 
1391   if (ST.hasApertureRegs()) {
1392     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1393     // getreg.
1394     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1395         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1396         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1397     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1398         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1399         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1400     unsigned Encoding =
1401         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1402         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1403         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1404 
1405     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1406 
1407     B.buildInstr(AMDGPU::S_GETREG_B32)
1408       .addDef(GetReg)
1409       .addImm(Encoding);
1410     MRI.setType(GetReg, S32);
1411 
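    // The 16-bit aperture field is returned in the low bits of the getreg
    // result; shift it into the high half to form the 32-bit aperture base.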
1412     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1413     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1414   }
1415 
1416   Register QueuePtr = MRI.createGenericVirtualRegister(
1417     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1418 
1419   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1420   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1421     return Register();
1422 
1423   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1424   // private_segment_aperture_base_hi.
1425   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1426 
1427   // TODO: can we be smarter about machine pointer info?
1428   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1429   MachineMemOperand *MMO = MF.getMachineMemOperand(
1430     PtrInfo,
1431     MachineMemOperand::MOLoad |
1432     MachineMemOperand::MODereferenceable |
1433     MachineMemOperand::MOInvariant,
1434     4,
1435     MinAlign(64, StructOffset));
1436 
1437   Register LoadAddr;
1438 
1439   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1440   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1441 }
1442 
1443 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1444   MachineInstr &MI, MachineRegisterInfo &MRI,
1445   MachineIRBuilder &B) const {
1446   MachineFunction &MF = B.getMF();
1447 
1448   B.setInstr(MI);
1449 
1450   const LLT S32 = LLT::scalar(32);
1451   Register Dst = MI.getOperand(0).getReg();
1452   Register Src = MI.getOperand(1).getReg();
1453 
1454   LLT DstTy = MRI.getType(Dst);
1455   LLT SrcTy = MRI.getType(Src);
1456   unsigned DestAS = DstTy.getAddressSpace();
1457   unsigned SrcAS = SrcTy.getAddressSpace();
1458 
1459   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1460   // vector element.
1461   assert(!DstTy.isVector());
1462 
1463   const AMDGPUTargetMachine &TM
1464     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1465 
1466   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1467   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1468     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1469     return true;
1470   }
1471 
1472   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1473     // Truncate.
1474     B.buildExtract(Dst, Src, 0);
1475     MI.eraseFromParent();
1476     return true;
1477   }
1478 
1479   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1480     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1481     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1482 
    // FIXME: This is a bit ugly: we merge two 32-bit pointer values into a
    // pointer of a different type. Merge operands are required to be the same
    // type, but creating an extra ptrtoint here would be kind of pointless.
1486     auto HighAddr = B.buildConstant(
1487       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1488     B.buildMerge(Dst, {Src, HighAddr});
1489     MI.eraseFromParent();
1490     return true;
1491   }
1492 
1493   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1494     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1495            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1496     unsigned NullVal = TM.getNullPointerValue(DestAS);
1497 
1498     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1499     auto FlatNull = B.buildConstant(SrcTy, 0);
1500 
1501     // Extract low 32-bits of the pointer.
1502     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1503 
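    // A null flat pointer must map to the segment null value; any other
    // pointer keeps its truncated low half.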
1504     auto CmpRes =
1505         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1506     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1507 
1508     MI.eraseFromParent();
1509     return true;
1510   }
1511 
1512   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1513     return false;
1514 
1515   if (!ST.hasFlatAddressSpace())
1516     return false;
1517 
1518   auto SegmentNull =
1519       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1520   auto FlatNull =
1521       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1522 
1523   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1524   if (!ApertureReg.isValid())
1525     return false;
1526 
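  // A null segment pointer must map to the flat null value; any other pointer
  // has the segment aperture merged in as its high 32 bits.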
1527   auto CmpRes =
1528       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1529 
1530   // Coerce the type of the low half of the result so we can use merge_values.
1531   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1532 
1533   // TODO: Should we allow mismatched types but matching sizes in merges to
1534   // avoid the ptrtoint?
1535   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1536   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1537 
1538   MI.eraseFromParent();
1539   return true;
1540 }
1541 
1542 bool AMDGPULegalizerInfo::legalizeFrint(
1543   MachineInstr &MI, MachineRegisterInfo &MRI,
1544   MachineIRBuilder &B) const {
1545   B.setInstr(MI);
1546 
1547   Register Src = MI.getOperand(1).getReg();
1548   LLT Ty = MRI.getType(Src);
1549   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1550 
1551   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1552   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1553 
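  // Adding 2^52 with the sign of the input and subtracting it again rounds
  // the value to an integer. Inputs whose magnitude exceeds C2 (just below
  // 2^52) are already integers and are passed through unchanged.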
1554   auto C1 = B.buildFConstant(Ty, C1Val);
1555   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1556 
1557   // TODO: Should this propagate fast-math-flags?
1558   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1559   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1560 
1561   auto C2 = B.buildFConstant(Ty, C2Val);
1562   auto Fabs = B.buildFAbs(Ty, Src);
1563 
1564   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1567 }
1568 
1569 bool AMDGPULegalizerInfo::legalizeFceil(
1570   MachineInstr &MI, MachineRegisterInfo &MRI,
1571   MachineIRBuilder &B) const {
1572   B.setInstr(MI);
1573 
1574   const LLT S1 = LLT::scalar(1);
1575   const LLT S64 = LLT::scalar(64);
1576 
1577   Register Src = MI.getOperand(1).getReg();
1578   assert(MRI.getType(Src) == S64);
1579 
1580   // result = trunc(src)
1581   // if (src > 0.0 && src != result)
1582   //   result += 1.0
1583 
1584   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1585 
1586   const auto Zero = B.buildFConstant(S64, 0.0);
1587   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1591   auto Add = B.buildSelect(S64, And, One, Zero);
1592 
1593   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1596 }
1597 
1598 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1599                                               MachineIRBuilder &B) {
1600   const unsigned FractBits = 52;
1601   const unsigned ExpBits = 11;
1602   LLT S32 = LLT::scalar(32);
1603 
1604   auto Const0 = B.buildConstant(S32, FractBits - 32);
1605   auto Const1 = B.buildConstant(S32, ExpBits);
1606 
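  // Extract the 11 exponent bits from the high word with ubfe, then subtract
  // the f64 exponent bias (1023).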
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1610 
1611   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1612 }
1613 
1614 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1615   MachineInstr &MI, MachineRegisterInfo &MRI,
1616   MachineIRBuilder &B) const {
1617   B.setInstr(MI);
1618 
1619   const LLT S1 = LLT::scalar(1);
1620   const LLT S32 = LLT::scalar(32);
1621   const LLT S64 = LLT::scalar(64);
1622 
1623   Register Src = MI.getOperand(1).getReg();
1624   assert(MRI.getType(Src) == S64);
1625 
1626   // TODO: Should this use extract since the low half is unused?
1627   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1628   Register Hi = Unmerge.getReg(1);
1629 
1630   // Extract the upper half, since this is where we will find the sign and
1631   // exponent.
1632   auto Exp = extractF64Exponent(Hi, B);
1633 
1634   const unsigned FractBits = 52;
1635 
1636   // Extract the sign bit.
1637   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1638   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1639 
1640   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1641 
1642   const auto Zero32 = B.buildConstant(S32, 0);
1643 
1644   // Extend back to 64-bits.
1645   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1646 
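  // Shift the fraction mask right by the unbiased exponent to get the
  // fractional bits that must be cleared, then mask them off the source.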
1647   auto Shr = B.buildAShr(S64, FractMask, Exp);
1648   auto Not = B.buildNot(S64, Shr);
1649   auto Tmp0 = B.buildAnd(S64, Src, Not);
1650   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1651 
1652   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1653   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1654 
1655   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1658 }
1659 
1660 bool AMDGPULegalizerInfo::legalizeITOFP(
1661   MachineInstr &MI, MachineRegisterInfo &MRI,
1662   MachineIRBuilder &B, bool Signed) const {
1663   B.setInstr(MI);
1664 
1665   Register Dst = MI.getOperand(0).getReg();
1666   Register Src = MI.getOperand(1).getReg();
1667 
1668   const LLT S64 = LLT::scalar(64);
1669   const LLT S32 = LLT::scalar(32);
1670 
1671   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1672 
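  // Convert the two 32-bit halves separately, scale the converted high half
  // by 2^32 with ldexp, and add in the unsigned low half.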
1673   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1674 
1675   auto CvtHi = Signed ?
1676     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1677     B.buildUITOFP(S64, Unmerge.getReg(1));
1678 
1679   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1680 
1681   auto ThirtyTwo = B.buildConstant(S32, 32);
1682   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1683     .addUse(CvtHi.getReg(0))
1684     .addUse(ThirtyTwo.getReg(0));
1685 
1686   // TODO: Should this propagate fast-math-flags?
1687   B.buildFAdd(Dst, LdExp, CvtLo);
1688   MI.eraseFromParent();
1689   return true;
1690 }
1691 
1692 // TODO: Copied from DAG implementation. Verify logic and document how this
1693 // actually works.
1694 bool AMDGPULegalizerInfo::legalizeFPTOI(
1695   MachineInstr &MI, MachineRegisterInfo &MRI,
1696   MachineIRBuilder &B, bool Signed) const {
1697   B.setInstr(MI);
1698 
1699   Register Dst = MI.getOperand(0).getReg();
1700   Register Src = MI.getOperand(1).getReg();
1701 
1702   const LLT S64 = LLT::scalar(64);
1703   const LLT S32 = LLT::scalar(32);
1704 
1705   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1706 
1707   unsigned Flags = MI.getFlags();
1708 
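  // Split the truncated value into 32-bit halves: Hi = floor(trunc(x) * K0)
  // and Lo = trunc(x) + Hi * K1, where K0 = 2^-32 and K1 = -2^32.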
1709   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1710   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1711   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1712 
1713   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1714   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1715   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1716 
1717   auto Hi = Signed ?
1718     B.buildFPTOSI(S32, FloorMul) :
1719     B.buildFPTOUI(S32, FloorMul);
1720   auto Lo = B.buildFPTOUI(S32, Fma);
1721 
1722   B.buildMerge(Dst, { Lo, Hi });
1723   MI.eraseFromParent();
1724 
1725   return true;
1726 }
1727 
1728 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1729   MachineInstr &MI, MachineRegisterInfo &MRI,
1730   MachineIRBuilder &B) const {
1731   MachineFunction &MF = B.getMF();
1732   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1733 
1734   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1735                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1736 
1737   // With ieee_mode disabled, the instructions have the correct behavior
1738   // already for G_FMINNUM/G_FMAXNUM
1739   if (!MFI->getMode().IEEE)
1740     return !IsIEEEOp;
1741 
1742   if (IsIEEEOp)
1743     return true;
1744 
1745   MachineIRBuilder HelperBuilder(MI);
1746   GISelObserverWrapper DummyObserver;
1747   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1748   HelperBuilder.setInstr(MI);
1749   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1750 }
1751 
1752 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1753   MachineInstr &MI, MachineRegisterInfo &MRI,
1754   MachineIRBuilder &B) const {
1755   // TODO: Should move some of this into LegalizerHelper.
1756 
1757   // TODO: Promote dynamic indexing of s16 to s32
1758 
1759   // FIXME: Artifact combiner probably should have replaced the truncated
1760   // constant before this, so we shouldn't need
1761   // getConstantVRegValWithLookThrough.
1762   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1763     MI.getOperand(2).getReg(), MRI);
1764   if (!IdxVal) // Dynamic case will be selected to register indexing.
1765     return true;
1766 
1767   Register Dst = MI.getOperand(0).getReg();
1768   Register Vec = MI.getOperand(1).getReg();
1769 
1770   LLT VecTy = MRI.getType(Vec);
1771   LLT EltTy = VecTy.getElementType();
1772   assert(EltTy == MRI.getType(Dst));
1773 
1774   B.setInstr(MI);
1775 
1776   if (IdxVal->Value < VecTy.getNumElements())
1777     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1778   else
1779     B.buildUndef(Dst);
1780 
1781   MI.eraseFromParent();
1782   return true;
1783 }
1784 
1785 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1786   MachineInstr &MI, MachineRegisterInfo &MRI,
1787   MachineIRBuilder &B) const {
1788   // TODO: Should move some of this into LegalizerHelper.
1789 
1790   // TODO: Promote dynamic indexing of s16 to s32
1791 
1792   // FIXME: Artifact combiner probably should have replaced the truncated
1793   // constant before this, so we shouldn't need
1794   // getConstantVRegValWithLookThrough.
1795   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1796     MI.getOperand(3).getReg(), MRI);
1797   if (!IdxVal) // Dynamic case will be selected to register indexing.
1798     return true;
1799 
1800   Register Dst = MI.getOperand(0).getReg();
1801   Register Vec = MI.getOperand(1).getReg();
1802   Register Ins = MI.getOperand(2).getReg();
1803 
1804   LLT VecTy = MRI.getType(Vec);
1805   LLT EltTy = VecTy.getElementType();
1806   assert(EltTy == MRI.getType(Ins));
1807 
1808   B.setInstr(MI);
1809 
1810   if (IdxVal->Value < VecTy.getNumElements())
1811     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1812   else
1813     B.buildUndef(Dst);
1814 
1815   MI.eraseFromParent();
1816   return true;
1817 }
1818 
1819 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1820   MachineInstr &MI, MachineRegisterInfo &MRI,
1821   MachineIRBuilder &B) const {
1822   const LLT V2S16 = LLT::vector(2, 16);
1823 
1824   Register Dst = MI.getOperand(0).getReg();
1825   Register Src0 = MI.getOperand(1).getReg();
1826   LLT DstTy = MRI.getType(Dst);
1827   LLT SrcTy = MRI.getType(Src0);
1828 
1829   if (SrcTy == V2S16 && DstTy == V2S16 &&
1830       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1831     return true;
1832 
1833   MachineIRBuilder HelperBuilder(MI);
1834   GISelObserverWrapper DummyObserver;
1835   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1836   HelperBuilder.setInstr(MI);
1837   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1838 }
1839 
1840 bool AMDGPULegalizerInfo::legalizeSinCos(
1841   MachineInstr &MI, MachineRegisterInfo &MRI,
1842   MachineIRBuilder &B) const {
1843   B.setInstr(MI);
1844 
1845   Register DstReg = MI.getOperand(0).getReg();
1846   Register SrcReg = MI.getOperand(1).getReg();
1847   LLT Ty = MRI.getType(DstReg);
1848   unsigned Flags = MI.getFlags();
1849 
1850   Register TrigVal;
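  // The hardware sin/cos instructions expect the input scaled by 1/(2*pi);
  // subtargets with reduced trig range also need the fract of the scaled
  // value.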
1851   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1852   if (ST.hasTrigReducedRange()) {
1853     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1854     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1855       .addUse(MulVal.getReg(0))
1856       .setMIFlags(Flags).getReg(0);
1857   } else
1858     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1859 
1860   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1861     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1862   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1863     .addUse(TrigVal)
1864     .setMIFlags(Flags);
1865   MI.eraseFromParent();
1866   return true;
1867 }
1868 
1869 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1870   Register DstReg, LLT PtrTy,
1871   MachineIRBuilder &B, const GlobalValue *GV,
1872   unsigned Offset, unsigned GAFlags) const {
1873   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1874   // to the following code sequence:
1875   //
1876   // For constant address space:
1877   //   s_getpc_b64 s[0:1]
1878   //   s_add_u32 s0, s0, $symbol
1879   //   s_addc_u32 s1, s1, 0
1880   //
1881   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1882   //   a fixup or relocation is emitted to replace $symbol with a literal
1883   //   constant, which is a pc-relative offset from the encoding of the $symbol
1884   //   operand to the global variable.
1885   //
1886   // For global address space:
1887   //   s_getpc_b64 s[0:1]
1888   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1889   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1890   //
1891   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1892   //   fixups or relocations are emitted to replace $symbol@*@lo and
1893   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1894   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1895   //   operand to the global variable.
1896   //
1897   // What we want here is an offset from the value returned by s_getpc
1898   // (which is the address of the s_add_u32 instruction) to the global
1899   // variable, but since the encoding of $symbol starts 4 bytes after the start
1900   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1901   // small. This requires us to add 4 to the global variable offset in order to
1902   // compute the correct address.
1903 
1904   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1905 
1906   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1907     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1908 
1909   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1910     .addDef(PCReg);
1911 
1912   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1913   if (GAFlags == SIInstrInfo::MO_NONE)
1914     MIB.addImm(0);
1915   else
1916     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1917 
1918   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1919 
1920   if (PtrTy.getSizeInBits() == 32)
1921     B.buildExtract(DstReg, PCReg, 0);
1922   return true;
1923  }
1924 
1925 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1926   MachineInstr &MI, MachineRegisterInfo &MRI,
1927   MachineIRBuilder &B) const {
1928   Register DstReg = MI.getOperand(0).getReg();
1929   LLT Ty = MRI.getType(DstReg);
1930   unsigned AS = Ty.getAddressSpace();
1931 
1932   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1933   MachineFunction &MF = B.getMF();
1934   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1935   B.setInstr(MI);
1936 
1937   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1938     if (!MFI->isEntryFunction()) {
1939       const Function &Fn = MF.getFunction();
1940       DiagnosticInfoUnsupported BadLDSDecl(
1941         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1942       Fn.getContext().diagnose(BadLDSDecl);
1943     }
1944 
1945     // TODO: We could emit code to handle the initialization somewhere.
1946     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1947       const SITargetLowering *TLI = ST.getTargetLowering();
1948       if (!TLI->shouldUseLDSConstAddress(GV)) {
1949         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1950         return true; // Leave in place;
1951       }
1952 
1953       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1954       MI.eraseFromParent();
1955       return true;
1956     }
1957 
1958     const Function &Fn = MF.getFunction();
1959     DiagnosticInfoUnsupported BadInit(
1960       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1961     Fn.getContext().diagnose(BadInit);
1962     return true;
1963   }
1964 
1965   const SITargetLowering *TLI = ST.getTargetLowering();
1966 
1967   if (TLI->shouldEmitFixup(GV)) {
1968     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1969     MI.eraseFromParent();
1970     return true;
1971   }
1972 
1973   if (TLI->shouldEmitPCReloc(GV)) {
1974     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1975     MI.eraseFromParent();
1976     return true;
1977   }
1978 
1979   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1980   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1981 
1982   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1983     MachinePointerInfo::getGOT(MF),
1984     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1985     MachineMemOperand::MOInvariant,
1986     8 /*Size*/, 8 /*Align*/);
1987 
1988   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1989 
1990   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1992     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1993     B.buildExtract(DstReg, Load, 0);
1994   } else
1995     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1996 
1997   MI.eraseFromParent();
1998   return true;
1999 }
2000 
2001 bool AMDGPULegalizerInfo::legalizeLoad(
2002   MachineInstr &MI, MachineRegisterInfo &MRI,
2003   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2004   B.setInstr(MI);
2005   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2006   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2007   Observer.changingInstr(MI);
2008   MI.getOperand(1).setReg(Cast.getReg(0));
2009   Observer.changedInstr(MI);
2010   return true;
2011 }
2012 
2013 bool AMDGPULegalizerInfo::legalizeFMad(
2014   MachineInstr &MI, MachineRegisterInfo &MRI,
2015   MachineIRBuilder &B) const {
2016   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2017   assert(Ty.isScalar());
2018 
2019   MachineFunction &MF = B.getMF();
2020   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2021 
2022   // TODO: Always legal with future ftz flag.
  // FIXME: Would checking just the output denormal mode be enough?
2024   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2025     return true;
2026   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2027     return true;
2028 
2029   MachineIRBuilder HelperBuilder(MI);
2030   GISelObserverWrapper DummyObserver;
2031   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2032   HelperBuilder.setMBB(*MI.getParent());
2033   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2034 }
2035 
2036 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2037   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2038   Register DstReg = MI.getOperand(0).getReg();
2039   Register PtrReg = MI.getOperand(1).getReg();
2040   Register CmpVal = MI.getOperand(2).getReg();
2041   Register NewVal = MI.getOperand(3).getReg();
2042 
2043   assert(SITargetLowering::isFlatGlobalAddrSpace(
2044            MRI.getType(PtrReg).getAddressSpace()) &&
2045          "this should not have been custom lowered");
2046 
2047   LLT ValTy = MRI.getType(CmpVal);
2048   LLT VecTy = LLT::vector(2, ValTy);
2049 
2050   B.setInstr(MI);
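  // The target cmpxchg pseudo takes the new value and the compare value
  // packed into a single vector operand.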
2051   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2052 
2053   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2054     .addDef(DstReg)
2055     .addUse(PtrReg)
2056     .addUse(PackedVal)
2057     .setMemRefs(MI.memoperands());
2058 
2059   MI.eraseFromParent();
2060   return true;
2061 }
2062 
2063 bool AMDGPULegalizerInfo::legalizeFlog(
2064   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2065   Register Dst = MI.getOperand(0).getReg();
2066   Register Src = MI.getOperand(1).getReg();
2067   LLT Ty = B.getMRI()->getType(Dst);
2068   unsigned Flags = MI.getFlags();
2069   B.setInstr(MI);
2070 
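  // log_b(x) = log2(x) * (1 / log2(b)), with the caller passing 1 / log2(b).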
2071   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2072   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2073 
2074   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2075   MI.eraseFromParent();
2076   return true;
2077 }
2078 
2079 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2080                                        MachineIRBuilder &B) const {
2081   Register Dst = MI.getOperand(0).getReg();
2082   Register Src = MI.getOperand(1).getReg();
2083   unsigned Flags = MI.getFlags();
2084   LLT Ty = B.getMRI()->getType(Dst);
2085   B.setInstr(MI);
2086 
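  // exp(x) = exp2(x * log2(e))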
2087   auto K = B.buildFConstant(Ty, numbers::log2e);
2088   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2089   B.buildFExp2(Dst, Mul, Flags);
2090   MI.eraseFromParent();
2091   return true;
2092 }
2093 
2094 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2095                                        MachineIRBuilder &B) const {
2096   Register Dst = MI.getOperand(0).getReg();
2097   Register Src0 = MI.getOperand(1).getReg();
2098   Register Src1 = MI.getOperand(2).getReg();
2099   unsigned Flags = MI.getFlags();
2100   LLT Ty = B.getMRI()->getType(Dst);
2101   B.setInstr(MI);
2102   const LLT S16 = LLT::scalar(16);
2103   const LLT S32 = LLT::scalar(32);
2104 
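  // pow(x, y) = exp2(log2(x) * y), using the legacy multiply for its
  // DX9-style 0 * anything == 0 behavior.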
2105   if (Ty == S32) {
2106     auto Log = B.buildFLog2(S32, Src0, Flags);
2107     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2108       .addUse(Log.getReg(0))
2109       .addUse(Src1)
2110       .setMIFlags(Flags);
2111     B.buildFExp2(Dst, Mul, Flags);
2112   } else if (Ty == S16) {
2113     // There's no f16 fmul_legacy, so we need to convert for it.
2114     auto Log = B.buildFLog2(S16, Src0, Flags);
2115     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2116     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2117     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2118       .addUse(Ext0.getReg(0))
2119       .addUse(Ext1.getReg(0))
2120       .setMIFlags(Flags);
2121 
2122     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2123   } else
2124     return false;
2125 
2126   MI.eraseFromParent();
2127   return true;
2128 }
2129 
2130 // Find a source register, ignoring any possible source modifiers.
2131 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2132   Register ModSrc = OrigSrc;
2133   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2134     ModSrc = SrcFNeg->getOperand(1).getReg();
2135     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2136       ModSrc = SrcFAbs->getOperand(1).getReg();
2137   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2138     ModSrc = SrcFAbs->getOperand(1).getReg();
2139   return ModSrc;
2140 }
2141 
2142 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2143                                          MachineRegisterInfo &MRI,
2144                                          MachineIRBuilder &B) const {
2145   B.setInstr(MI);
2146 
2147   const LLT S1 = LLT::scalar(1);
2148   const LLT S64 = LLT::scalar(64);
2149   Register Dst = MI.getOperand(0).getReg();
2150   Register OrigSrc = MI.getOperand(1).getReg();
2151   unsigned Flags = MI.getFlags();
2152   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2153          "this should not have been custom lowered");
2154 
2155   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2156   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2157   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2158   // V_FRACT bug is:
2159   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2160   //
2161   // Convert floor(x) to (x - fract(x))
2162 
2163   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2164     .addUse(OrigSrc)
2165     .setMIFlags(Flags);
2166 
2167   // Give source modifier matching some assistance before obscuring a foldable
2168   // pattern.
2169 
2170   // TODO: We can avoid the neg on the fract? The input sign to fract
2171   // shouldn't matter?
2172   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2173 
2174   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2175 
2176   Register Min = MRI.createGenericVirtualRegister(S64);
2177 
2178   // We don't need to concern ourselves with the snan handling difference, so
2179   // use the one which will directly select.
2180   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2181   if (MFI->getMode().IEEE)
2182     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2183   else
2184     B.buildFMinNum(Min, Fract, Const, Flags);
2185 
2186   Register CorrectedFract = Min;
2187   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2189     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2190   }
2191 
2192   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2193   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2194 
2195   MI.eraseFromParent();
2196   return true;
2197 }
2198 
2199 // Turn an illegal packed v2s16 build vector into bit operations.
2200 // TODO: This should probably be a bitcast action in LegalizerHelper.
2201 bool AMDGPULegalizerInfo::legalizeBuildVector(
2202   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2203   Register Dst = MI.getOperand(0).getReg();
2204   LLT DstTy = MRI.getType(Dst);
2205   const LLT S32 = LLT::scalar(32);
2206   const LLT V2S16 = LLT::vector(2, 16);
2207   (void)DstTy;
2208   (void)V2S16;
2209   assert(DstTy == V2S16);
2210 
2211   Register Src0 = MI.getOperand(1).getReg();
2212   Register Src1 = MI.getOperand(2).getReg();
2213   assert(MRI.getType(Src0) == LLT::scalar(16));
2214 
2215   B.setInstr(MI);
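  // Merge the two 16-bit scalars into a 32-bit value and bitcast it to the
  // packed vector type.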
2216   auto Merge = B.buildMerge(S32, {Src0, Src1});
2217   B.buildBitcast(Dst, Merge);
2218 
2219   MI.eraseFromParent();
2220   return true;
2221 }
2222 
2223 // Return the use branch instruction, otherwise null if the usage is invalid.
2224 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2225                                        MachineRegisterInfo &MRI,
2226                                        MachineInstr *&Br) {
2227   Register CondDef = MI.getOperand(0).getReg();
2228   if (!MRI.hasOneNonDBGUse(CondDef))
2229     return nullptr;
2230 
2231   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2232   if (UseMI.getParent() != MI.getParent() ||
2233       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2234     return nullptr;
2235 
2236   // Make sure the cond br is followed by a G_BR
2237   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2238   if (Next != MI.getParent()->end()) {
2239     if (Next->getOpcode() != AMDGPU::G_BR)
2240       return nullptr;
2241     Br = &*Next;
2242   }
2243 
2244   return &UseMI;
2245 }
2246 
2247 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2248                                                 Register Reg, LLT Ty) const {
2249   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2250   if (LiveIn)
2251     return LiveIn;
2252 
2253   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2254   MRI.addLiveIn(Reg, NewReg);
2255   return NewReg;
2256 }
2257 
2258 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2259                                          const ArgDescriptor *Arg) const {
2260   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2261     return false; // TODO: Handle these
2262 
2263   assert(Arg->getRegister().isPhysical());
2264 
2265   MachineRegisterInfo &MRI = *B.getMRI();
2266 
2267   LLT Ty = MRI.getType(DstReg);
2268   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2269 
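  // Some arguments are packed into a register together with other values;
  // extract the field with a shift and mask.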
2270   if (Arg->isMasked()) {
2271     // TODO: Should we try to emit this once in the entry block?
2272     const LLT S32 = LLT::scalar(32);
2273     const unsigned Mask = Arg->getMask();
2274     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2275 
2276     Register AndMaskSrc = LiveIn;
2277 
2278     if (Shift != 0) {
2279       auto ShiftAmt = B.buildConstant(S32, Shift);
2280       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2281     }
2282 
2283     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2284   } else
2285     B.buildCopy(DstReg, LiveIn);
2286 
  // Insert the argument copy if it doesn't already exist.
2288   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2289   if (!MRI.getVRegDef(LiveIn)) {
2290     // FIXME: Should have scoped insert pt
2291     MachineBasicBlock &OrigInsBB = B.getMBB();
2292     auto OrigInsPt = B.getInsertPt();
2293 
2294     MachineBasicBlock &EntryMBB = B.getMF().front();
2295     EntryMBB.addLiveIn(Arg->getRegister());
2296     B.setInsertPt(EntryMBB, EntryMBB.begin());
2297     B.buildCopy(LiveIn, Arg->getRegister());
2298 
2299     B.setInsertPt(OrigInsBB, OrigInsPt);
2300   }
2301 
2302   return true;
2303 }
2304 
2305 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2306   MachineInstr &MI,
2307   MachineRegisterInfo &MRI,
2308   MachineIRBuilder &B,
2309   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2310   B.setInstr(MI);
2311 
2312   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2313 
2314   const ArgDescriptor *Arg;
2315   const TargetRegisterClass *RC;
2316   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2317   if (!Arg) {
2318     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2319     return false;
2320   }
2321 
2322   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2323     MI.eraseFromParent();
2324     return true;
2325   }
2326 
2327   return false;
2328 }
2329 
2330 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2331                                        MachineRegisterInfo &MRI,
2332                                        MachineIRBuilder &B) const {
2333   B.setInstr(MI);
2334   Register Dst = MI.getOperand(0).getReg();
2335   LLT DstTy = MRI.getType(Dst);
2336   LLT S16 = LLT::scalar(16);
2337   LLT S32 = LLT::scalar(32);
2338   LLT S64 = LLT::scalar(64);
2339 
2340   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2341     return true;
2342 
2343   if (DstTy == S16)
2344     return legalizeFDIV16(MI, MRI, B);
2345   if (DstTy == S32)
2346     return legalizeFDIV32(MI, MRI, B);
2347   if (DstTy == S64)
2348     return legalizeFDIV64(MI, MRI, B);
2349 
2350   return false;
2351 }
2352 
2353 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
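  // Approximate 2^32 / Src: convert to float, take the reciprocal, scale by
  // 2^32 (0x4f800000), and convert back to an unsigned integer.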
2354   const LLT S32 = LLT::scalar(32);
2355 
2356   auto Cvt0 = B.buildUITOFP(S32, Src);
2357   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2358   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2359   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2360   return B.buildFPTOUI(S32, Mul).getReg(0);
2361 }
2362 
2363 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2364                                                   Register DstReg,
2365                                                   Register Num,
2366                                                   Register Den,
2367                                                   bool IsRem) const {
2368   const LLT S1 = LLT::scalar(1);
2369   const LLT S32 = LLT::scalar(32);
2370 
2371   // RCP =  URECIP(Den) = 2^32 / Den + e
2372   // e is rounding error.
2373   auto RCP = buildDivRCP(B, Den);
2374 
2375   // RCP_LO = mul(RCP, Den)
2376   auto RCP_LO = B.buildMul(S32, RCP, Den);
2377 
  // RCP_HI = mulhu(RCP, Den)
2379   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2380 
2381   // NEG_RCP_LO = -RCP_LO
2382   auto Zero = B.buildConstant(S32, 0);
2383   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2384 
2385   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2386   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2387   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2388 
2389   // Calculate the rounding error from the URECIP instruction
2390   // E = mulhu(ABS_RCP_LO, RCP)
2391   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2392 
2393   // RCP_A_E = RCP + E
2394   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2395 
2396   // RCP_S_E = RCP - E
2397   auto RCP_S_E = B.buildSub(S32, RCP, E);
2398 
2399   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2400   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2401 
  // Quotient = mulhu(Tmp0, Num)
2403   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2404 
2405   // Num_S_Remainder = Quotient * Den
2406   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2407 
2408   // Remainder = Num - Num_S_Remainder
2409   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2410 
2411   // Remainder_GE_Den = Remainder >= Den
2412   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2413 
2414   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2415   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2416                                        Num, Num_S_Remainder);
2417 
2418   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2419   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2420 
2421   // Calculate Division result:
2422 
2423   // Quotient_A_One = Quotient + 1
2424   auto One = B.buildConstant(S32, 1);
2425   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2426 
2427   // Quotient_S_One = Quotient - 1
2428   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2429 
2430   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2431   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2432 
2433   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2434   if (IsRem) {
2435     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2436 
2437     // Calculate Rem result:
2438     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2439 
2440     // Remainder_A_Den = Remainder + Den
2441     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2442 
2443     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2444     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2445 
2446     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2447     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2448   } else {
2449     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2450   }
2451 }
2452 
2453 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2454                                               MachineRegisterInfo &MRI,
2455                                               MachineIRBuilder &B) const {
2456   B.setInstr(MI);
2457   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2458   Register DstReg = MI.getOperand(0).getReg();
2459   Register Num = MI.getOperand(1).getReg();
2460   Register Den = MI.getOperand(2).getReg();
2461   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2462   MI.eraseFromParent();
2463   return true;
2464 }
2465 
2466 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2467                                             MachineRegisterInfo &MRI,
2468                                             MachineIRBuilder &B) const {
2469   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2470     return legalizeUDIV_UREM32(MI, MRI, B);
2471   return false;
2472 }
2473 
2474 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2475                                               MachineRegisterInfo &MRI,
2476                                               MachineIRBuilder &B) const {
2477   B.setInstr(MI);
2478   const LLT S32 = LLT::scalar(32);
2479 
2480   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2481   Register DstReg = MI.getOperand(0).getReg();
2482   Register LHS = MI.getOperand(1).getReg();
2483   Register RHS = MI.getOperand(2).getReg();
2484 
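  // Take absolute values via (x + sign) ^ sign, perform the unsigned
  // division, then fix up the sign of the result below.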
2485   auto ThirtyOne = B.buildConstant(S32, 31);
2486   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2488 
2489   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2490   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2491 
2492   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2493   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2494 
2495   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2496   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2497 
2498   if (IsRem) {
2499     auto RSign = LHSign; // Remainder sign is the same as LHS
2500     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2501     B.buildSub(DstReg, UDivRem, RSign);
2502   } else {
2503     auto DSign = B.buildXor(S32, LHSign, RHSign);
2504     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2505     B.buildSub(DstReg, UDivRem, DSign);
2506   }
2507 
2508   MI.eraseFromParent();
2509   return true;
2510 }
2511 
2512 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2513                                             MachineRegisterInfo &MRI,
2514                                             MachineIRBuilder &B) const {
2515   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2516     return legalizeSDIV_SREM32(MI, MRI, B);
2517   return false;
2518 }
2519 
2520 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2521                                                  MachineRegisterInfo &MRI,
2522                                                  MachineIRBuilder &B) const {
2523   Register Res = MI.getOperand(0).getReg();
2524   Register LHS = MI.getOperand(1).getReg();
2525   Register RHS = MI.getOperand(2).getReg();
2526 
2527   uint16_t Flags = MI.getFlags();
2528 
2529   LLT ResTy = MRI.getType(Res);
2530   LLT S32 = LLT::scalar(32);
2531   LLT S64 = LLT::scalar(64);
2532 
2533   const MachineFunction &MF = B.getMF();
2534   bool Unsafe =
2535     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2536 
2537   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2538     return false;
2539 
2540   if (!Unsafe && ResTy == S32 &&
2541       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2542     return false;
2543 
2544   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2545     // 1 / x -> RCP(x)
2546     if (CLHS->isExactlyValue(1.0)) {
2547       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2548         .addUse(RHS)
2549         .setMIFlags(Flags);
2550 
2551       MI.eraseFromParent();
2552       return true;
2553     }
2554 
2555     // -1 / x -> RCP( FNEG(x) )
2556     if (CLHS->isExactlyValue(-1.0)) {
2557       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2558       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2559         .addUse(FNeg.getReg(0))
2560         .setMIFlags(Flags);
2561 
2562       MI.eraseFromParent();
2563       return true;
2564     }
2565   }
2566 
2567   // x / y -> x * (1.0 / y)
2568   if (Unsafe) {
2569     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2570       .addUse(RHS)
2571       .setMIFlags(Flags);
2572     B.buildFMul(Res, LHS, RCP, Flags);
2573 
2574     MI.eraseFromParent();
2575     return true;
2576   }
2577 
2578   return false;
2579 }
2580 
2581 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2582                                          MachineRegisterInfo &MRI,
2583                                          MachineIRBuilder &B) const {
2584   B.setInstr(MI);
2585   Register Res = MI.getOperand(0).getReg();
2586   Register LHS = MI.getOperand(1).getReg();
2587   Register RHS = MI.getOperand(2).getReg();
2588 
2589   uint16_t Flags = MI.getFlags();
2590 
2591   LLT S16 = LLT::scalar(16);
2592   LLT S32 = LLT::scalar(32);
2593 
2594   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2595   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2596 
2597   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2598     .addUse(RHSExt.getReg(0))
2599     .setMIFlags(Flags);
2600 
2601   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2602   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2603 
2604   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2605     .addUse(RDst.getReg(0))
2606     .addUse(RHS)
2607     .addUse(LHS)
2608     .setMIFlags(Flags);
2609 
2610   MI.eraseFromParent();
2611   return true;
2612 }
2613 
2614 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2615 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2616 static void toggleSPDenormMode(bool Enable,
2617                                MachineIRBuilder &B,
2618                                const GCNSubtarget &ST,
2619                                AMDGPU::SIModeRegisterDefaults Mode) {
2620   // Set SP denorm mode to this value.
2621   unsigned SPDenormMode =
2622     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2623 
2624   if (ST.hasDenormModeInst()) {
2625     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2626     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2627 
2628     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2629     B.buildInstr(AMDGPU::S_DENORM_MODE)
2630       .addImm(NewDenormModeValue);
2631 
2632   } else {
2633     // Select FP32 bit field in mode register.
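    // The FP32 denorm controls live in MODE[5:4], i.e. hwreg offset 4,
    // width 2.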
2634     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2635                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2636                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2637 
2638     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2639       .addImm(SPDenormMode)
2640       .addImm(SPDenormModeBitField);
2641   }
2642 }
2643 
2644 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2645                                          MachineRegisterInfo &MRI,
2646                                          MachineIRBuilder &B) const {
2647   B.setInstr(MI);
2648   Register Res = MI.getOperand(0).getReg();
2649   Register LHS = MI.getOperand(1).getReg();
2650   Register RHS = MI.getOperand(2).getReg();
2651   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2652   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2653 
2654   uint16_t Flags = MI.getFlags();
2655 
2656   LLT S32 = LLT::scalar(32);
2657   LLT S1 = LLT::scalar(1);
2658 
2659   auto One = B.buildFConstant(S32, 1.0f);
2660 
2661   auto DenominatorScaled =
2662     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2663       .addUse(RHS)
2664       .addUse(LHS)
2665       .addImm(1)
2666       .setMIFlags(Flags);
2667   auto NumeratorScaled =
2668     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2669       .addUse(LHS)
2670       .addUse(RHS)
2671       .addImm(0)
2672       .setMIFlags(Flags);
2673 
2674   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2675     .addUse(DenominatorScaled.getReg(0))
2676     .setMIFlags(Flags);
2677   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2678 
2679   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2680   // aren't modeled as reading it.
2681   if (!Mode.allFP32Denormals())
2682     toggleSPDenormMode(true, B, ST, Mode);
2683 
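  // Refine the reciprocal and the scaled quotient with Newton-Raphson style
  // FMA steps; div_fmas and div_fixup below apply the final scaling and
  // special-case handling.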
2684   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2685   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2686   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2687   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2688   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2689   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2690 
2691   if (!Mode.allFP32Denormals())
2692     toggleSPDenormMode(false, B, ST, Mode);
2693 
2694   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2695     .addUse(Fma4.getReg(0))
2696     .addUse(Fma1.getReg(0))
2697     .addUse(Fma3.getReg(0))
2698     .addUse(NumeratorScaled.getReg(1))
2699     .setMIFlags(Flags);
2700 
2701   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2702     .addUse(Fmas.getReg(0))
2703     .addUse(RHS)
2704     .addUse(LHS)
2705     .setMIFlags(Flags);
2706 
2707   MI.eraseFromParent();
2708   return true;
2709 }
2710 
2711 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2712                                          MachineRegisterInfo &MRI,
2713                                          MachineIRBuilder &B) const {
2714   B.setInstr(MI);
2715   Register Res = MI.getOperand(0).getReg();
2716   Register LHS = MI.getOperand(1).getReg();
2717   Register RHS = MI.getOperand(2).getReg();
2718 
2719   uint16_t Flags = MI.getFlags();
2720 
2721   LLT S64 = LLT::scalar(64);
2722   LLT S1 = LLT::scalar(1);
2723 
2724   auto One = B.buildFConstant(S64, 1.0);
2725 
2726   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2727     .addUse(LHS)
2728     .addUse(RHS)
2729     .addImm(1)
2730     .setMIFlags(Flags);
2731 
2732   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2733 
2734   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2735     .addUse(DivScale0.getReg(0))
2736     .setMIFlags(Flags);
2737 
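  // Refine the reciprocal approximation and the scaled quotient with FMA
  // steps; div_fmas and div_fixup apply the final scale and special cases.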
2738   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2739   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2740   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2741 
2742   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2743     .addUse(LHS)
2744     .addUse(RHS)
2745     .addImm(0)
2746     .setMIFlags(Flags);
2747 
2748   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2750   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2751 
2752   Register Scale;
2753   if (!ST.hasUsableDivScaleConditionOutput()) {
2754     // Workaround a hardware bug on SI where the condition output from div_scale
2755     // is not usable.
2756 
2757     LLT S32 = LLT::scalar(32);
2758 
2759     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2760     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2761     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2762     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2763 
2764     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2765                               Scale1Unmerge.getReg(1));
2766     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2767                               Scale0Unmerge.getReg(1));
2768     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2769   } else {
2770     Scale = DivScale1.getReg(1);
2771   }
2772 
2773   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2774     .addUse(Fma4.getReg(0))
2775     .addUse(Fma3.getReg(0))
2776     .addUse(Mul.getReg(0))
2777     .addUse(Scale)
2778     .setMIFlags(Flags);
2779 
2780   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2781     .addUse(Fmas.getReg(0))
2782     .addUse(RHS)
2783     .addUse(LHS)
2784     .setMIFlags(Flags);
2785 
2786   MI.eraseFromParent();
2787   return true;
2788 }
2789 
2790 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2791                                                  MachineRegisterInfo &MRI,
2792                                                  MachineIRBuilder &B) const {
2793   B.setInstr(MI);
2794   Register Res = MI.getOperand(0).getReg();
2795   Register LHS = MI.getOperand(2).getReg();
2796   Register RHS = MI.getOperand(3).getReg();
2797   uint16_t Flags = MI.getFlags();
2798 
2799   LLT S32 = LLT::scalar(32);
2800   LLT S1 = LLT::scalar(1);
2801 
2802   auto Abs = B.buildFAbs(S32, RHS, Flags);
2803   const APFloat C0Val(1.0f);
2804 
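  // If |RHS| exceeds 2^96, pre-scale the denominator by 2^-32 before taking
  // the reciprocal; the same scale is applied to the result so it cancels
  // out.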
2805   auto C0 = B.buildConstant(S32, 0x6f800000);
2806   auto C1 = B.buildConstant(S32, 0x2f800000);
2807   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2808 
2809   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2810   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2811 
2812   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2813 
2814   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2815     .addUse(Mul0.getReg(0))
2816     .setMIFlags(Flags);
2817 
2818   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2819 
2820   B.buildFMul(Res, Sel, Mul1, Flags);
2821 
2822   MI.eraseFromParent();
2823   return true;
2824 }
2825 
2826 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2827                                                  MachineRegisterInfo &MRI,
2828                                                  MachineIRBuilder &B) const {
2829   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2830   if (!MFI->isEntryFunction()) {
2831     return legalizePreloadedArgIntrin(MI, MRI, B,
2832                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2833   }
2834 
2835   B.setInstr(MI);
2836 
2837   uint64_t Offset =
2838     ST.getTargetLowering()->getImplicitParameterOffset(
2839       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2840   Register DstReg = MI.getOperand(0).getReg();
2841   LLT DstTy = MRI.getType(DstReg);
2842   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2843 
2844   const ArgDescriptor *Arg;
2845   const TargetRegisterClass *RC;
2846   std::tie(Arg, RC)
2847     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2848   if (!Arg)
2849     return false;
2850 
2851   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2852   if (!loadInputValue(KernargPtrReg, B, Arg))
2853     return false;
2854 
2855   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2856   MI.eraseFromParent();
2857   return true;
2858 }
2859 
2860 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2861                                               MachineRegisterInfo &MRI,
2862                                               MachineIRBuilder &B,
2863                                               unsigned AddrSpace) const {
2864   B.setInstr(MI);
2865   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2866   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2867   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2868   MI.eraseFromParent();
2869   return true;
2870 }
2871 
2872 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2873 // offset (the offset that is included in bounds checking and swizzling, to be
2874 // split between the instruction's voffset and immoffset fields) and soffset
2875 // (the offset that is excluded from bounds checking and swizzling, to go in
2876 // the instruction's soffset field).  This function takes the first kind of
2877 // offset and figures out how to split it between voffset and immoffset.
2878 std::tuple<Register, unsigned, unsigned>
2879 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2880                                         Register OrigOffset) const {
2881   const unsigned MaxImm = 4095;
2882   Register BaseReg;
2883   unsigned TotalConstOffset;
2884   MachineInstr *OffsetDef;
2885   const LLT S32 = LLT::scalar(32);
2886 
2887   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2888     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2889 
2890   unsigned ImmOffset = TotalConstOffset;
2891 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits in it so that the value that is copied/added for the voffset
  // field is a multiple of 4096, and it stands more chance of being CSEd with
  // the copy/add for another similar load/store.
2896   // However, do not do that rounding down to a multiple of 4096 if that is a
2897   // negative number, as it appears to be illegal to have a negative offset
2898   // in the vgpr, even if adding the immediate offset makes it positive.
2899   unsigned Overflow = ImmOffset & ~MaxImm;
2900   ImmOffset -= Overflow;
2901   if ((int32_t)Overflow < 0) {
2902     Overflow += ImmOffset;
2903     ImmOffset = 0;
2904   }
2905 
2906   if (Overflow != 0) {
2907     if (!BaseReg) {
2908       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2909     } else {
2910       auto OverflowVal = B.buildConstant(S32, Overflow);
2911       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2912     }
2913   }
2914 
2915   if (!BaseReg)
2916     BaseReg = B.buildConstant(S32, 0).getReg(0);
2917 
2918   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2919 }
2920 
2921 /// Handle register layout difference for f16 images for some subtargets.
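/// On subtargets with unpacked D16 VMEM, each 16-bit data element occupies
/// the low half of its own 32-bit register, so the s16 vector source is
/// widened here to one any-extended s32 element per original element.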
2922 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2923                                              MachineRegisterInfo &MRI,
2924                                              Register Reg) const {
2925   if (!ST.hasUnpackedD16VMem())
2926     return Reg;
2927 
2928   const LLT S16 = LLT::scalar(16);
2929   const LLT S32 = LLT::scalar(32);
2930   LLT StoreVT = MRI.getType(Reg);
2931   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2932 
2933   auto Unmerge = B.buildUnmerge(S16, Reg);
2934 
2935   SmallVector<Register, 4> WideRegs;
2936   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2937     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2938 
2939   int NumElts = StoreVT.getNumElements();
2940 
2941   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2942 }
2943 
2944 Register AMDGPULegalizerInfo::fixStoreSourceType(
2945   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2946   MachineRegisterInfo *MRI = B.getMRI();
2947   LLT Ty = MRI->getType(VData);
2948 
2949   const LLT S16 = LLT::scalar(16);
2950 
  // Fixup illegal register types for i8 and i16 stores.
2952   if (Ty == LLT::scalar(8) || Ty == S16) {
2953     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2954     return AnyExt;
2955   }
2956 
2957   if (Ty.isVector()) {
2958     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2959       if (IsFormat)
2960         return handleD16VData(B, *MRI, VData);
2961     }
2962   }
2963 
2964   return VData;
2965 }
2966 
2967 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2968                                               MachineRegisterInfo &MRI,
2969                                               MachineIRBuilder &B,
2970                                               bool IsTyped,
2971                                               bool IsFormat) const {
2972   B.setInstr(MI);
2973 
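  // Operand layout, matching the indices used below:
  //   intrinsic ID, vdata, rsrc, [vindex (struct only),] voffset, soffset,
  //   [format (typed only),] auxiliary data (cachepolicy/swizzle)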
2974   Register VData = MI.getOperand(1).getReg();
2975   LLT Ty = MRI.getType(VData);
2976   LLT EltTy = Ty.getScalarType();
2977   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2978   const LLT S32 = LLT::scalar(32);
2979 
2980   VData = fixStoreSourceType(B, VData, IsFormat);
2981   Register RSrc = MI.getOperand(2).getReg();
2982 
2983   MachineMemOperand *MMO = *MI.memoperands_begin();
2984   const int MemSize = MMO->getSize();
2985 
2986   unsigned ImmOffset;
2987   unsigned TotalOffset;
2988 
2989   // The typed intrinsics add an immediate after the registers.
2990   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2991 
2992   // The struct intrinsic variants add one additional operand over raw.
2993   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2994   Register VIndex;
2995   int OpOffset = 0;
2996   if (HasVIndex) {
2997     VIndex = MI.getOperand(3).getReg();
2998     OpOffset = 1;
2999   }
3000 
3001   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3002   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3003 
3004   unsigned Format = 0;
3005   if (IsTyped) {
3006     Format = MI.getOperand(5 + OpOffset).getImm();
3007     ++OpOffset;
3008   }
3009 
3010   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3011 
3012   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3013   if (TotalOffset != 0)
3014     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3015 
3016   unsigned Opc;
3017   if (IsTyped) {
3018     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3019                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3020   } else if (IsFormat) {
3021     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3022                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3023   } else {
3024     switch (MemSize) {
3025     case 1:
3026       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3027       break;
3028     case 2:
3029       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3030       break;
3031     default:
3032       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3033       break;
3034     }
3035   }
3036 
3037   if (!VIndex)
3038     VIndex = B.buildConstant(S32, 0).getReg(0);
3039 
3040   auto MIB = B.buildInstr(Opc)
3041     .addUse(VData)              // vdata
3042     .addUse(RSrc)               // rsrc
3043     .addUse(VIndex)             // vindex
3044     .addUse(VOffset)            // voffset
3045     .addUse(SOffset)            // soffset
3046     .addImm(ImmOffset);         // offset(imm)
3047 
3048   if (IsTyped)
3049     MIB.addImm(Format);
3050 
3051   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3052      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3053      .addMemOperand(MMO);
3054 
3055   MI.eraseFromParent();
3056   return true;
3057 }
3058 
3059 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3060                                              MachineRegisterInfo &MRI,
3061                                              MachineIRBuilder &B,
3062                                              bool IsFormat,
3063                                              bool IsTyped) const {
3064   B.setInstr(MI);
3065 
3066   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3067   MachineMemOperand *MMO = *MI.memoperands_begin();
3068   const int MemSize = MMO->getSize();
3069   const LLT S32 = LLT::scalar(32);
3070 
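  // Operand layout, matching the indices used below:
  //   dst, intrinsic ID, rsrc, [vindex (struct only),] voffset, soffset,
  //   [format (typed only),] auxiliary data (cachepolicy/swizzle)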
3071   Register Dst = MI.getOperand(0).getReg();
3072   Register RSrc = MI.getOperand(2).getReg();
3073 
3074   // The typed intrinsics add an immediate after the registers.
3075   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3076 
3077   // The struct intrinsic variants add one additional operand over raw.
3078   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3079   Register VIndex;
3080   int OpOffset = 0;
3081   if (HasVIndex) {
3082     VIndex = MI.getOperand(3).getReg();
3083     OpOffset = 1;
3084   }
3085 
3086   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3087   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3088 
3089   unsigned Format = 0;
3090   if (IsTyped) {
3091     Format = MI.getOperand(5 + OpOffset).getImm();
3092     ++OpOffset;
3093   }
3094 
3095   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3096   unsigned ImmOffset;
3097   unsigned TotalOffset;
3098 
3099   LLT Ty = MRI.getType(Dst);
3100   LLT EltTy = Ty.getScalarType();
3101   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3102   const bool Unpacked = ST.hasUnpackedD16VMem();
3103 
3104   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3105   if (TotalOffset != 0)
3106     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3107 
3108   unsigned Opc;
3109 
3110   if (IsTyped) {
3111     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3112                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3113   } else if (IsFormat) {
3114     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3115                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3116   } else {
3117     switch (MemSize) {
3118     case 1:
3119       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3120       break;
3121     case 2:
3122       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3123       break;
3124     default:
3125       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3126       break;
3127     }
3128   }
3129 
3130   Register LoadDstReg;
3131 
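  // Sub-dword non-d16 loads and scalar d16 loads produce a 32-bit result that
  // must be truncated back down afterwards; unpacked d16 vector loads produce
  // one dword per element and must be repacked into the 16-bit vector type.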
3132   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3133   LLT UnpackedTy = Ty.changeElementSize(32);
3134 
3135   if (IsExtLoad)
3136     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3137   else if (Unpacked && IsD16 && Ty.isVector())
3138     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3139   else
3140     LoadDstReg = Dst;
3141 
3142   if (!VIndex)
3143     VIndex = B.buildConstant(S32, 0).getReg(0);
3144 
3145   auto MIB = B.buildInstr(Opc)
3146     .addDef(LoadDstReg)         // vdata
3147     .addUse(RSrc)               // rsrc
3148     .addUse(VIndex)             // vindex
3149     .addUse(VOffset)            // voffset
3150     .addUse(SOffset)            // soffset
3151     .addImm(ImmOffset);         // offset(imm)
3152 
3153   if (IsTyped)
3154     MIB.addImm(Format);
3155 
3156   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3157      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3158      .addMemOperand(MMO);
3159 
3160   if (LoadDstReg != Dst) {
3161     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3162 
    // The result was widened to 32 bits for the extending load; truncate it
    // back to the original type.
    if (IsExtLoad) {
      B.buildTrunc(Dst, LoadDstReg);
    } else {
      // Repack to the original 16-bit vector result.
      // FIXME: G_TRUNC should work, but legalization currently fails.
3169       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3170       SmallVector<Register, 4> Repack;
3171       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3172         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3173       B.buildMerge(Dst, Repack);
3174     }
3175   }
3176 
3177   MI.eraseFromParent();
3178   return true;
3179 }
3180 
3181 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3182                                                MachineIRBuilder &B,
3183                                                bool IsInc) const {
3184   B.setInstr(MI);
3185   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3186                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3187   B.buildInstr(Opc)
3188     .addDef(MI.getOperand(0).getReg())
3189     .addUse(MI.getOperand(2).getReg())
3190     .addUse(MI.getOperand(3).getReg())
3191     .cloneMemRefs(MI);
3192   MI.eraseFromParent();
3193   return true;
3194 }
3195 
3196 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3197   switch (IntrID) {
3198   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3199   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3200     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3201   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3202   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3203     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3204   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3205   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3206     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3207   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3208   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3209     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3210   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3211   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3212     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3213   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3214   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3215     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3216   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3217   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3218     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3219   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3220   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3221     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3222   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3223   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3224     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3225   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3226   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3227     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3228   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3229   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3230     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3231   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3232   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3233     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3234   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3235   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3236     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3237   default:
3238     llvm_unreachable("unhandled atomic opcode");
3239   }
3240 }
3241 
3242 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3243                                                MachineIRBuilder &B,
3244                                                Intrinsic::ID IID) const {
3245   B.setInstr(MI);
3246 
3247   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3248                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3249 
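  // Operand layout, matching the indices used below:
  //   dst, intrinsic ID, vdata, [cmp (cmpswap only),] rsrc,
  //   [vindex (struct only),] voffset, soffset, auxiliary data (cachepolicy)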
3250   Register Dst = MI.getOperand(0).getReg();
3251   Register VData = MI.getOperand(2).getReg();
3252 
3253   Register CmpVal;
3254   int OpOffset = 0;
3255 
3256   if (IsCmpSwap) {
3257     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3258     ++OpOffset;
3259   }
3260 
3261   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3262   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3263 
3264   // The struct intrinsic variants add one additional operand over raw.
3265   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3266   Register VIndex;
3267   if (HasVIndex) {
3268     VIndex = MI.getOperand(4 + OpOffset).getReg();
3269     ++OpOffset;
3270   }
3271 
3272   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3273   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3274   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3275 
3276   MachineMemOperand *MMO = *MI.memoperands_begin();
3277 
3278   unsigned ImmOffset;
3279   unsigned TotalOffset;
3280   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3281   if (TotalOffset != 0)
3282     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3283 
3284   if (!VIndex)
3285     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3286 
3287   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3288     .addDef(Dst)
3289     .addUse(VData); // vdata
3290 
3291   if (IsCmpSwap)
3292     MIB.addReg(CmpVal);
3293 
3294   MIB.addUse(RSrc)               // rsrc
3295      .addUse(VIndex)             // vindex
3296      .addUse(VOffset)            // voffset
3297      .addUse(SOffset)            // soffset
3298      .addImm(ImmOffset)          // offset(imm)
3299      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3300      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3301      .addMemOperand(MMO);
3302 
3303   MI.eraseFromParent();
3304   return true;
3305 }
3306 
3307 // Produce a vector of s16 elements from s32 pieces.
3308 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3309                              ArrayRef<Register> UnmergeParts) {
3310   const LLT S16 = LLT::scalar(16);
3311 
3312   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3313   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3314     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3315 
3316   B.buildBuildVector(DstReg, RemergeParts);
3317 }
3318 
3319 /// Convert a set of s32 registers to a result vector with s16 elements.
3320 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3321                                ArrayRef<Register> UnmergeParts) {
3322   MachineRegisterInfo &MRI = *B.getMRI();
3323   const LLT V2S16 = LLT::vector(2, 16);
3324   LLT TargetTy = MRI.getType(DstReg);
3325   int NumElts = UnmergeParts.size();
3326 
3327   if (NumElts == 1) {
3328     assert(TargetTy == V2S16);
3329     B.buildBitcast(DstReg, UnmergeParts[0]);
3330     return;
3331   }
3332 
3333   SmallVector<Register, 4> RemergeParts(NumElts);
3334   for (int I = 0; I != NumElts; ++I)
3335     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3336 
3337   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3338     B.buildConcatVectors(DstReg, RemergeParts);
3339     return;
3340   }
3341 
3342   const LLT V3S16 = LLT::vector(3, 16);
3343   const LLT V6S16 = LLT::vector(6, 16);
3344 
3345   // Widen to v6s16 and unpack v3 parts.
3346   assert(TargetTy == V3S16);
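  // The v2s16 pieces cover one more s16 element than the result needs, so pad
  // with an undef v2s16, concatenate to v6s16, then unmerge and keep only the
  // low v3s16 half.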
3347 
3348   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3349   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3350   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3351 }
3352 
// FIXME: Just a vector trunc should be sufficient, but legalization is
// currently broken.
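// Each 32-bit element of the unpacked wide result holds one 16-bit value in
// its low half; truncate each piece and rebuild the 16-bit result vector.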
3355 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3356                                   Register WideDstReg) {
3357   const LLT S32 = LLT::scalar(32);
3358   const LLT S16 = LLT::scalar(16);
3359 
3360   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3361 
3362   int NumOps = Unmerge->getNumOperands() - 1;
3363   SmallVector<Register, 4> RemergeParts(NumOps);
3364   for (int I = 0; I != NumOps; ++I)
3365     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3366 
3367   B.buildBuildVector(DstReg, RemergeParts);
3368 }
3369 
3370 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3371     MachineInstr &MI, MachineIRBuilder &B,
3372     GISelChangeObserver &Observer,
3373     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3374   bool IsTFE = MI.getNumExplicitDefs() == 2;
3375 
  // We only need to process the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or when we need to repack the TFE
  // result.
3378 
3379   // TODO: Need to handle a16 images too
3380   // TODO: Do we need to guard against already legalized intrinsics?
3381   if (!IsTFE && !ST.hasUnpackedD16VMem())
3382     return true;
3383 
3384   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3385     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3386 
3387   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3388     return true;
3389 
3390   B.setInstr(MI);
3391 
3392   MachineRegisterInfo *MRI = B.getMRI();
3393   const LLT S32 = LLT::scalar(32);
3394   const LLT S16 = LLT::scalar(16);
3395 
3396   if (BaseOpcode->Store) { // No TFE for stores?
3397     Register VData = MI.getOperand(1).getReg();
3398     LLT Ty = MRI->getType(VData);
3399     if (!Ty.isVector() || Ty.getElementType() != S16)
3400       return true;
3401 
3404     Observer.changingInstr(MI);
3405     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3406     Observer.changedInstr(MI);
3407     return true;
3408   }
3409 
3410   Register DstReg = MI.getOperand(0).getReg();
3411   LLT Ty = MRI->getType(DstReg);
3412   const LLT EltTy = Ty.getScalarType();
3413   const bool IsD16 = Ty.getScalarType() == S16;
3414   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3415 
3416   if (IsTFE) {
    // In the IR, TFE is supposed to be used with a 2 element struct return
    // type. The instruction really returns these two values in one contiguous
    // register, with one additional dword beyond the loaded data. Rewrite the
    // return type to use a single register result.
3421     Register Dst1Reg = MI.getOperand(1).getReg();
3422     if (MRI->getType(Dst1Reg) != S32)
3423       return false;
3424 
3425     // TODO: Make sure the TFE operand bit is set.
3426 
    // The raw dword-aligned data component of the load. The only legal cases
    // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
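    // For example, a <3 x s16> d16 load with TFE on a packed subtarget uses
    // RoundedTy = <4 x s16> and TFETy = <3 x s32> (two data dwords plus the
    // TFE status dword).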
3430     LLT RoundedTy;
3431     LLT TFETy;
3432 
3433     if (IsD16 && ST.hasUnpackedD16VMem()) {
3434       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3435       TFETy = LLT::vector(NumElts + 1, 32);
3436     } else {
3437       unsigned EltSize = Ty.getScalarSizeInBits();
3438       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3439       unsigned RoundedSize = 32 * RoundedElts;
3440       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3441       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3442     }
3443 
3444     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3445     Observer.changingInstr(MI);
3446 
3447     MI.getOperand(0).setReg(TFEReg);
3448     MI.RemoveOperand(1);
3449 
3450     Observer.changedInstr(MI);
3451 
3452     // Insert after the instruction.
3453     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3454 
3455     // Now figure out how to copy the new result register back into the old
3456     // result.
3457 
3458     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3459     int NumDataElts = TFETy.getNumElements() - 1;
3460 
3461     if (!Ty.isVector()) {
3462       // Simplest case is a trivial unmerge (plus a truncate for d16).
3463       UnmergeResults[0] = Ty == S32 ?
3464         DstReg : MRI->createGenericVirtualRegister(S32);
3465 
3466       B.buildUnmerge(UnmergeResults, TFEReg);
3467       if (Ty != S32)
3468         B.buildTrunc(DstReg, UnmergeResults[0]);
3469       return true;
3470     }
3471 
3472     // We have to repack into a new vector of some kind.
3473     for (int I = 0; I != NumDataElts; ++I)
3474       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3475     B.buildUnmerge(UnmergeResults, TFEReg);
3476 
3477     // Drop the final TFE element.
3478     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3479 
3480     if (EltTy == S32)
3481       B.buildBuildVector(DstReg, DataPart);
3482     else if (ST.hasUnpackedD16VMem())
3483       truncToS16Vector(B, DstReg, DataPart);
3484     else
3485       bitcastToS16Vector(B, DstReg, DataPart);
3486 
3487     return true;
3488   }
3489 
3490   // Must be an image load.
3491   if (!Ty.isVector() || Ty.getElementType() != S16)
3492     return true;
3493 
3494   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3495 
3496   LLT WidenedTy = Ty.changeElementType(S32);
3497   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3498 
3499   Observer.changingInstr(MI);
3500   MI.getOperand(0).setReg(WideDstReg);
3501   Observer.changedInstr(MI);
3502 
3503   repackUnpackedD16Load(B, DstReg, WideDstReg);
3504   return true;
3505 }
3506 
3507 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3508   MachineInstr &MI, MachineIRBuilder &B,
3509   GISelChangeObserver &Observer) const {
3510   Register Dst = MI.getOperand(0).getReg();
3511   LLT Ty = B.getMRI()->getType(Dst);
3512   unsigned Size = Ty.getSizeInBits();
3513   MachineFunction &MF = B.getMF();
3514 
3515   Observer.changingInstr(MI);
3516 
3517   // FIXME: We don't really need this intermediate instruction. The intrinsic
3518   // should be fixed to have a memory operand. Since it's readnone, we're not
3519   // allowed to add one.
3520   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3521   MI.RemoveOperand(1); // Remove intrinsic ID
3522 
3523   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3524   // TODO: Should this use datalayout alignment?
3525   const unsigned MemSize = (Size + 7) / 8;
3526   const unsigned MemAlign = 4;
3527   MachineMemOperand *MMO = MF.getMachineMemOperand(
3528     MachinePointerInfo(),
3529     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3530     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3531   MI.addMemOperand(MF, MMO);
3532 
3533   // There are no 96-bit result scalar loads, but widening to 128-bit should
3534   // always be legal. We may need to restore this to a 96-bit result if it turns
3535   // out this needs to be converted to a vector load during RegBankSelect.
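  // For example, a <3 x s32> (96-bit) result is widened to <4 x s32>, and an
  // s96 scalar result to s128.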
3536   if (!isPowerOf2_32(Size)) {
3537     LegalizerHelper Helper(MF, *this, Observer, B);
3538     B.setInstr(MI);
3539 
3540     if (Ty.isVector())
3541       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3542     else
3543       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3544   }
3545 
3546   Observer.changedInstr(MI);
3547   return true;
3548 }
3549 
3550 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3551                                             MachineIRBuilder &B,
3552                                             GISelChangeObserver &Observer) const {
3553   MachineRegisterInfo &MRI = *B.getMRI();
3554 
  // Replace the G_BRCOND use with the exec manipulation and branch pseudos.
3556   auto IntrID = MI.getIntrinsicID();
3557   switch (IntrID) {
3558   case Intrinsic::amdgcn_if:
3559   case Intrinsic::amdgcn_else: {
3560     MachineInstr *Br = nullptr;
3561     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3562       const SIRegisterInfo *TRI
3563         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3564 
3565       B.setInstr(*BrCond);
3566       Register Def = MI.getOperand(1).getReg();
3567       Register Use = MI.getOperand(3).getReg();
3568 
3569       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3570       if (Br)
3571         BrTarget = Br->getOperand(0).getMBB();
3572 
3573       if (IntrID == Intrinsic::amdgcn_if) {
3574         B.buildInstr(AMDGPU::SI_IF)
3575           .addDef(Def)
3576           .addUse(Use)
3577           .addMBB(BrTarget);
3578       } else {
3579         B.buildInstr(AMDGPU::SI_ELSE)
3580           .addDef(Def)
3581           .addUse(Use)
3582           .addMBB(BrTarget)
3583           .addImm(0);
3584       }
3585 
3586       if (Br)
3587         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3588 
3589       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3590       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3591       MI.eraseFromParent();
3592       BrCond->eraseFromParent();
3593       return true;
3594     }
3595 
3596     return false;
3597   }
3598   case Intrinsic::amdgcn_loop: {
3599     MachineInstr *Br = nullptr;
3600     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3601       const SIRegisterInfo *TRI
3602         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3603 
3604       B.setInstr(*BrCond);
3605 
3606       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3607       if (Br)
3608         BrTarget = Br->getOperand(0).getMBB();
3609 
3610       Register Reg = MI.getOperand(2).getReg();
3611       B.buildInstr(AMDGPU::SI_LOOP)
3612         .addUse(Reg)
3613         .addMBB(BrTarget);
3614 
3615       if (Br)
3616         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3617 
3618       MI.eraseFromParent();
3619       BrCond->eraseFromParent();
3620       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3621       return true;
3622     }
3623 
3624     return false;
3625   }
3626   case Intrinsic::amdgcn_kernarg_segment_ptr:
3627     return legalizePreloadedArgIntrin(
3628       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3629   case Intrinsic::amdgcn_implicitarg_ptr:
3630     return legalizeImplicitArgPtr(MI, MRI, B);
3631   case Intrinsic::amdgcn_workitem_id_x:
3632     return legalizePreloadedArgIntrin(MI, MRI, B,
3633                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3634   case Intrinsic::amdgcn_workitem_id_y:
3635     return legalizePreloadedArgIntrin(MI, MRI, B,
3636                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3637   case Intrinsic::amdgcn_workitem_id_z:
3638     return legalizePreloadedArgIntrin(MI, MRI, B,
3639                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3640   case Intrinsic::amdgcn_workgroup_id_x:
3641     return legalizePreloadedArgIntrin(MI, MRI, B,
3642                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3643   case Intrinsic::amdgcn_workgroup_id_y:
3644     return legalizePreloadedArgIntrin(MI, MRI, B,
3645                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3646   case Intrinsic::amdgcn_workgroup_id_z:
3647     return legalizePreloadedArgIntrin(MI, MRI, B,
3648                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3649   case Intrinsic::amdgcn_dispatch_ptr:
3650     return legalizePreloadedArgIntrin(MI, MRI, B,
3651                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3652   case Intrinsic::amdgcn_queue_ptr:
3653     return legalizePreloadedArgIntrin(MI, MRI, B,
3654                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3655   case Intrinsic::amdgcn_implicit_buffer_ptr:
3656     return legalizePreloadedArgIntrin(
3657       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3658   case Intrinsic::amdgcn_dispatch_id:
3659     return legalizePreloadedArgIntrin(MI, MRI, B,
3660                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3661   case Intrinsic::amdgcn_fdiv_fast:
3662     return legalizeFDIVFastIntrin(MI, MRI, B);
3663   case Intrinsic::amdgcn_is_shared:
3664     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3665   case Intrinsic::amdgcn_is_private:
3666     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3667   case Intrinsic::amdgcn_wavefrontsize: {
3668     B.setInstr(MI);
3669     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3670     MI.eraseFromParent();
3671     return true;
3672   }
3673   case Intrinsic::amdgcn_s_buffer_load:
3674     return legalizeSBufferLoad(MI, B, Observer);
3675   case Intrinsic::amdgcn_raw_buffer_store:
3676   case Intrinsic::amdgcn_struct_buffer_store:
3677     return legalizeBufferStore(MI, MRI, B, false, false);
3678   case Intrinsic::amdgcn_raw_buffer_store_format:
3679   case Intrinsic::amdgcn_struct_buffer_store_format:
3680     return legalizeBufferStore(MI, MRI, B, false, true);
3681   case Intrinsic::amdgcn_raw_tbuffer_store:
3682   case Intrinsic::amdgcn_struct_tbuffer_store:
3683     return legalizeBufferStore(MI, MRI, B, true, true);
3684   case Intrinsic::amdgcn_raw_buffer_load:
3685   case Intrinsic::amdgcn_struct_buffer_load:
3686     return legalizeBufferLoad(MI, MRI, B, false, false);
3687   case Intrinsic::amdgcn_raw_buffer_load_format:
3688   case Intrinsic::amdgcn_struct_buffer_load_format:
3689     return legalizeBufferLoad(MI, MRI, B, true, false);
3690   case Intrinsic::amdgcn_raw_tbuffer_load:
3691   case Intrinsic::amdgcn_struct_tbuffer_load:
3692     return legalizeBufferLoad(MI, MRI, B, true, true);
3693   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3694   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3695   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3696   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3697   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3698   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3699   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3700   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3701   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3702   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3703   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3704   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3705   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3706   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3707   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3708   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3709   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3710   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3711   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3712   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3713   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3714   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3715   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3716   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3717   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3718   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3719     return legalizeBufferAtomic(MI, B, IntrID);
3720   case Intrinsic::amdgcn_atomic_inc:
3721     return legalizeAtomicIncDec(MI, B, true);
3722   case Intrinsic::amdgcn_atomic_dec:
3723     return legalizeAtomicIncDec(MI, B, false);
3724   default: {
3725     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3726             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3727       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3728     return true;
3729   }
3730   }
3731 
3732   return true;
3733 }
3734