1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Round the number of elements to the next power of two elements
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
52 // Round the number of bits to the next power of two bits
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
182   return [=](const LegalityQuery &Query) {
183     return Query.Types[TypeIdx0].getSizeInBits() <
184            Query.Types[TypeIdx1].getSizeInBits();
185   };
186 }
187 
188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
189   return [=](const LegalityQuery &Query) {
190     return Query.Types[TypeIdx0].getSizeInBits() >
191            Query.Types[TypeIdx1].getSizeInBits();
192   };
193 }
194 
195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
196                                          const GCNTargetMachine &TM)
197   :  ST(ST_) {
198   using namespace TargetOpcode;
199 
200   auto GetAddrSpacePtr = [&TM](unsigned AS) {
201     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
202   };
203 
204   const LLT S1 = LLT::scalar(1);
205   const LLT S16 = LLT::scalar(16);
206   const LLT S32 = LLT::scalar(32);
207   const LLT S64 = LLT::scalar(64);
208   const LLT S128 = LLT::scalar(128);
209   const LLT S256 = LLT::scalar(256);
210   const LLT S1024 = LLT::scalar(1024);
211 
212   const LLT V2S16 = LLT::vector(2, 16);
213   const LLT V4S16 = LLT::vector(4, 16);
214 
215   const LLT V2S32 = LLT::vector(2, 32);
216   const LLT V3S32 = LLT::vector(3, 32);
217   const LLT V4S32 = LLT::vector(4, 32);
218   const LLT V5S32 = LLT::vector(5, 32);
219   const LLT V6S32 = LLT::vector(6, 32);
220   const LLT V7S32 = LLT::vector(7, 32);
221   const LLT V8S32 = LLT::vector(8, 32);
222   const LLT V9S32 = LLT::vector(9, 32);
223   const LLT V10S32 = LLT::vector(10, 32);
224   const LLT V11S32 = LLT::vector(11, 32);
225   const LLT V12S32 = LLT::vector(12, 32);
226   const LLT V13S32 = LLT::vector(13, 32);
227   const LLT V14S32 = LLT::vector(14, 32);
228   const LLT V15S32 = LLT::vector(15, 32);
229   const LLT V16S32 = LLT::vector(16, 32);
230   const LLT V32S32 = LLT::vector(32, 32);
231 
232   const LLT V2S64 = LLT::vector(2, 64);
233   const LLT V3S64 = LLT::vector(3, 64);
234   const LLT V4S64 = LLT::vector(4, 64);
235   const LLT V5S64 = LLT::vector(5, 64);
236   const LLT V6S64 = LLT::vector(6, 64);
237   const LLT V7S64 = LLT::vector(7, 64);
238   const LLT V8S64 = LLT::vector(8, 64);
239   const LLT V16S64 = LLT::vector(16, 64);
240 
241   std::initializer_list<LLT> AllS32Vectors =
242     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
243      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
244   std::initializer_list<LLT> AllS64Vectors =
245     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
246 
247   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
248   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
249   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
250   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
251   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
252   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
253   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
254 
255   const LLT CodePtr = FlatPtr;
256 
257   const std::initializer_list<LLT> AddrSpaces64 = {
258     GlobalPtr, ConstantPtr, FlatPtr
259   };
260 
261   const std::initializer_list<LLT> AddrSpaces32 = {
262     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
263   };
264 
265   const std::initializer_list<LLT> FPTypesBase = {
266     S32, S64
267   };
268 
269   const std::initializer_list<LLT> FPTypes16 = {
270     S32, S64, S16
271   };
272 
273   const std::initializer_list<LLT> FPTypesPK16 = {
274     S32, S64, S16, V2S16
275   };
276 
277   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
278 
279   setAction({G_BRCOND, S1}, Legal); // VCC branches
280   setAction({G_BRCOND, S32}, Legal); // SCC branches
281 
282   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
283   // elements for v3s16
284   getActionDefinitionsBuilder(G_PHI)
285     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
286     .legalFor(AllS32Vectors)
287     .legalFor(AllS64Vectors)
288     .legalFor(AddrSpaces64)
289     .legalFor(AddrSpaces32)
290     .clampScalar(0, S32, S256)
291     .widenScalarToNextPow2(0, 32)
292     .clampMaxNumElements(0, S32, 16)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .legalIf(isPointer(0));
295 
296   if (ST.hasVOP3PInsts()) {
297     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
298       .legalFor({S32, S16, V2S16})
299       .clampScalar(0, S16, S32)
300       .clampMaxNumElements(0, S16, 2)
301       .scalarize(0)
302       .widenScalarToNextPow2(0, 32);
303   } else if (ST.has16BitInsts()) {
304     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
305       .legalFor({S32, S16})
306       .clampScalar(0, S16, S32)
307       .scalarize(0)
308       .widenScalarToNextPow2(0, 32);
309   } else {
310     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
311       .legalFor({S32})
312       .clampScalar(0, S32, S32)
313       .scalarize(0);
314   }
315 
316   // FIXME: Not really legal. Placeholder for custom lowering.
317   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
318     .customFor({S32, S64})
319     .clampScalar(0, S32, S64)
320     .widenScalarToNextPow2(0, 32)
321     .scalarize(0);
322 
323   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
324     .legalFor({S32})
325     .clampScalar(0, S32, S32)
326     .scalarize(0);
327 
328   // Report legal for any types we can handle anywhere. For the cases only legal
329   // on the SALU, RegBankSelect will be able to re-legalize.
330   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
331     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
332     .clampScalar(0, S32, S64)
333     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
334     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
335     .widenScalarToNextPow2(0)
336     .scalarize(0);
337 
338   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
339                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
340     .legalFor({{S32, S1}, {S32, S32}})
341     .minScalar(0, S32)
342     // TODO: .scalarize(0)
343     .lower();
344 
345   getActionDefinitionsBuilder(G_BITCAST)
346     // Don't worry about the size constraint.
347     .legalIf(all(isRegisterType(0), isRegisterType(1)))
348     .lower();
349 
350 
351   getActionDefinitionsBuilder(G_CONSTANT)
352     .legalFor({S1, S32, S64, S16, GlobalPtr,
353                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
354     .clampScalar(0, S32, S64)
355     .widenScalarToNextPow2(0)
356     .legalIf(isPointer(0));
357 
358   getActionDefinitionsBuilder(G_FCONSTANT)
359     .legalFor({S32, S64, S16})
360     .clampScalar(0, S16, S64);
361 
362   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
363     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
364                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
365     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
366     .clampScalarOrElt(0, S32, S1024)
367     .legalIf(isMultiple32(0))
368     .widenScalarToNextPow2(0, 32)
369     .clampMaxNumElements(0, S32, 16);
370 
371   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
372   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
373     .unsupportedFor({PrivatePtr})
374     .custom();
375   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
376 
377   auto &FPOpActions = getActionDefinitionsBuilder(
378     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
379     .legalFor({S32, S64});
380   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
381     .customFor({S32, S64});
382   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
383     .customFor({S32, S64});
384 
385   if (ST.has16BitInsts()) {
386     if (ST.hasVOP3PInsts())
387       FPOpActions.legalFor({S16, V2S16});
388     else
389       FPOpActions.legalFor({S16});
390 
391     TrigActions.customFor({S16});
392     FDIVActions.customFor({S16});
393   }
394 
395   auto &MinNumMaxNum = getActionDefinitionsBuilder({
396       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
397 
398   if (ST.hasVOP3PInsts()) {
399     MinNumMaxNum.customFor(FPTypesPK16)
400       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
401       .clampMaxNumElements(0, S16, 2)
402       .clampScalar(0, S16, S64)
403       .scalarize(0);
404   } else if (ST.has16BitInsts()) {
405     MinNumMaxNum.customFor(FPTypes16)
406       .clampScalar(0, S16, S64)
407       .scalarize(0);
408   } else {
409     MinNumMaxNum.customFor(FPTypesBase)
410       .clampScalar(0, S32, S64)
411       .scalarize(0);
412   }
413 
414   if (ST.hasVOP3PInsts())
415     FPOpActions.clampMaxNumElements(0, S16, 2);
416 
417   FPOpActions
418     .scalarize(0)
419     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
420 
421   TrigActions
422     .scalarize(0)
423     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
424 
425   FDIVActions
426     .scalarize(0)
427     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
428 
429   getActionDefinitionsBuilder({G_FNEG, G_FABS})
430     .legalFor(FPTypesPK16)
431     .clampMaxNumElements(0, S16, 2)
432     .scalarize(0)
433     .clampScalar(0, S16, S64);
434 
435   if (ST.has16BitInsts()) {
436     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
437       .legalFor({S32, S64, S16})
438       .scalarize(0)
439       .clampScalar(0, S16, S64);
440   } else {
441     getActionDefinitionsBuilder(G_FSQRT)
442       .legalFor({S32, S64})
443       .scalarize(0)
444       .clampScalar(0, S32, S64);
445 
446     if (ST.hasFractBug()) {
447       getActionDefinitionsBuilder(G_FFLOOR)
448         .customFor({S64})
449         .legalFor({S32, S64})
450         .scalarize(0)
451         .clampScalar(0, S32, S64);
452     } else {
453       getActionDefinitionsBuilder(G_FFLOOR)
454         .legalFor({S32, S64})
455         .scalarize(0)
456         .clampScalar(0, S32, S64);
457     }
458   }
459 
460   getActionDefinitionsBuilder(G_FPTRUNC)
461     .legalFor({{S32, S64}, {S16, S32}})
462     .scalarize(0)
463     .lower();
464 
465   getActionDefinitionsBuilder(G_FPEXT)
466     .legalFor({{S64, S32}, {S32, S16}})
467     .lowerFor({{S64, S16}}) // FIXME: Implement
468     .scalarize(0);
469 
470   getActionDefinitionsBuilder(G_FSUB)
471       // Use actual fsub instruction
472       .legalFor({S32})
473       // Must use fadd + fneg
474       .lowerFor({S64, S16, V2S16})
475       .scalarize(0)
476       .clampScalar(0, S32, S64);
477 
478   // Whether this is legal depends on the floating point mode for the function.
479   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
480   if (ST.hasMadF16())
481     FMad.customFor({S32, S16});
482   else
483     FMad.customFor({S32});
484   FMad.scalarize(0)
485       .lower();
486 
487   getActionDefinitionsBuilder(G_TRUNC)
488     .alwaysLegal();
489 
490   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
491     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
492                {S32, S1}, {S64, S1}, {S16, S1}})
493     .scalarize(0)
494     .clampScalar(0, S32, S64)
495     .widenScalarToNextPow2(1, 32);
496 
497   // TODO: Split s1->s64 during regbankselect for VALU.
498   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
499     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
500     .lowerFor({{S32, S64}})
501     .lowerIf(typeIs(1, S1))
502     .customFor({{S64, S64}});
503   if (ST.has16BitInsts())
504     IToFP.legalFor({{S16, S16}});
505   IToFP.clampScalar(1, S32, S64)
506        .scalarize(0)
507        .widenScalarToNextPow2(1);
508 
509   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
510     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
511     .customFor({{S64, S64}});
512   if (ST.has16BitInsts())
513     FPToI.legalFor({{S16, S16}});
514   else
515     FPToI.minScalar(1, S32);
516 
517   FPToI.minScalar(0, S32)
518        .scalarize(0)
519        .lower();
520 
521   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
522     .scalarize(0)
523     .lower();
524 
525   if (ST.has16BitInsts()) {
526     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
527       .legalFor({S16, S32, S64})
528       .clampScalar(0, S16, S64)
529       .scalarize(0);
530   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
531     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
532       .legalFor({S32, S64})
533       .clampScalar(0, S32, S64)
534       .scalarize(0);
535   } else {
536     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
537       .legalFor({S32})
538       .customFor({S64})
539       .clampScalar(0, S32, S64)
540       .scalarize(0);
541   }
542 
543   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
544     .scalarize(0)
545     .alwaysLegal();
546 
547   auto &CmpBuilder =
548     getActionDefinitionsBuilder(G_ICMP)
549     // The compare output type differs based on the register bank of the output,
550     // so make both s1 and s32 legal.
551     //
552     // Scalar compares producing output in scc will be promoted to s32, as that
553     // is the allocatable register type that will be needed for the copy from
554     // scc. This will be promoted during RegBankSelect, and we assume something
555     // before that won't try to use s32 result types.
556     //
557     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
558     // bank.
559     .legalForCartesianProduct(
560       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
561     .legalForCartesianProduct(
562       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
563   if (ST.has16BitInsts()) {
564     CmpBuilder.legalFor({{S1, S16}});
565   }
566 
567   CmpBuilder
568     .widenScalarToNextPow2(1)
569     .clampScalar(1, S32, S64)
570     .scalarize(0)
571     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
572 
573   getActionDefinitionsBuilder(G_FCMP)
574     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
575     .widenScalarToNextPow2(1)
576     .clampScalar(1, S32, S64)
577     .scalarize(0);
578 
579   // FIXME: fpow has a selection pattern that should move to custom lowering.
580   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
581   if (ST.has16BitInsts())
582     Exp2Ops.legalFor({S32, S16});
583   else
584     Exp2Ops.legalFor({S32});
585   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
586   Exp2Ops.scalarize(0);
587 
588   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
589   if (ST.has16BitInsts())
590     ExpOps.customFor({{S32}, {S16}});
591   else
592     ExpOps.customFor({S32});
593   ExpOps.clampScalar(0, MinScalarFPTy, S32)
594         .scalarize(0);
595 
596   // The 64-bit versions produce 32-bit results, but only on the SALU.
597   getActionDefinitionsBuilder(G_CTPOP)
598     .legalFor({{S32, S32}, {S32, S64}})
599     .clampScalar(0, S32, S32)
600     .clampScalar(1, S32, S64)
601     .scalarize(0)
602     .widenScalarToNextPow2(0, 32)
603     .widenScalarToNextPow2(1, 32);
604 
605   // The hardware instructions return a different result on 0 than the generic
606   // instructions expect. The hardware produces -1, but these produce the
607   // bitwidth.
608   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
609     .scalarize(0)
610     .clampScalar(0, S32, S32)
611     .clampScalar(1, S32, S64)
612     .widenScalarToNextPow2(0, 32)
613     .widenScalarToNextPow2(1, 32)
614     .lower();
615 
616   // The 64-bit versions produce 32-bit results, but only on the SALU.
617   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
618     .legalFor({{S32, S32}, {S32, S64}})
619     .clampScalar(0, S32, S32)
620     .clampScalar(1, S32, S64)
621     .scalarize(0)
622     .widenScalarToNextPow2(0, 32)
623     .widenScalarToNextPow2(1, 32);
624 
625   getActionDefinitionsBuilder(G_BITREVERSE)
626     .legalFor({S32})
627     .clampScalar(0, S32, S32)
628     .scalarize(0);
629 
630   if (ST.has16BitInsts()) {
631     getActionDefinitionsBuilder(G_BSWAP)
632       .legalFor({S16, S32, V2S16})
633       .clampMaxNumElements(0, S16, 2)
634       // FIXME: Fixing non-power-of-2 before clamp is workaround for
635       // narrowScalar limitation.
636       .widenScalarToNextPow2(0)
637       .clampScalar(0, S16, S32)
638       .scalarize(0);
639 
640     if (ST.hasVOP3PInsts()) {
641       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
642         .legalFor({S32, S16, V2S16})
643         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
644         .clampMaxNumElements(0, S16, 2)
645         .minScalar(0, S16)
646         .widenScalarToNextPow2(0)
647         .scalarize(0)
648         .lower();
649     } else {
650       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
651         .legalFor({S32, S16})
652         .widenScalarToNextPow2(0)
653         .minScalar(0, S16)
654         .scalarize(0)
655         .lower();
656     }
657   } else {
658     // TODO: Should have same legality without v_perm_b32
659     getActionDefinitionsBuilder(G_BSWAP)
660       .legalFor({S32})
661       .lowerIf(narrowerThan(0, 32))
662       // FIXME: Fixing non-power-of-2 before clamp is workaround for
663       // narrowScalar limitation.
664       .widenScalarToNextPow2(0)
665       .maxScalar(0, S32)
666       .scalarize(0)
667       .lower();
668 
669     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
670       .legalFor({S32})
671       .minScalar(0, S32)
672       .widenScalarToNextPow2(0)
673       .scalarize(0)
674       .lower();
675   }
676 
677   getActionDefinitionsBuilder(G_INTTOPTR)
678     // List the common cases
679     .legalForCartesianProduct(AddrSpaces64, {S64})
680     .legalForCartesianProduct(AddrSpaces32, {S32})
681     .scalarize(0)
682     // Accept any address space as long as the size matches
683     .legalIf(sameSize(0, 1))
684     .widenScalarIf(smallerThan(1, 0),
685       [](const LegalityQuery &Query) {
686         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
687       })
688     .narrowScalarIf(greaterThan(1, 0),
689       [](const LegalityQuery &Query) {
690         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
691       });
692 
693   getActionDefinitionsBuilder(G_PTRTOINT)
694     // List the common cases
695     .legalForCartesianProduct(AddrSpaces64, {S64})
696     .legalForCartesianProduct(AddrSpaces32, {S32})
697     .scalarize(0)
698     // Accept any address space as long as the size matches
699     .legalIf(sameSize(0, 1))
700     .widenScalarIf(smallerThan(0, 1),
701       [](const LegalityQuery &Query) {
702         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
703       })
704     .narrowScalarIf(
705       greaterThan(0, 1),
706       [](const LegalityQuery &Query) {
707         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
708       });
709 
710   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
711     .scalarize(0)
712     .custom();
713 
714   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
715   // handle some operations by just promoting the register during
716   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
717   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
718     switch (AS) {
719     // FIXME: Private element size.
720     case AMDGPUAS::PRIVATE_ADDRESS:
721       return 32;
722     // FIXME: Check subtarget
723     case AMDGPUAS::LOCAL_ADDRESS:
724       return ST.useDS128() ? 128 : 64;
725 
726     // Treat constant and global as identical. SMRD loads are sometimes usable
727     // for global loads (ideally constant address space should be eliminated)
728     // depending on the context. Legality cannot be context dependent, but
729     // RegBankSelect can split the load as necessary depending on the pointer
730     // register bank/uniformity and if the memory is invariant or not written in
731     // a kernel.
732     case AMDGPUAS::CONSTANT_ADDRESS:
733     case AMDGPUAS::GLOBAL_ADDRESS:
734       return IsLoad ? 512 : 128;
735     default:
736       return 128;
737     }
738   };
739 
740   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
741                                     bool IsLoad) -> bool {
742     const LLT DstTy = Query.Types[0];
743 
744     // Split vector extloads.
745     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
746     unsigned Align = Query.MMODescrs[0].AlignInBits;
747 
748     if (MemSize < DstTy.getSizeInBits())
749       MemSize = std::max(MemSize, Align);
750 
751     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
752       return true;
753 
754     const LLT PtrTy = Query.Types[1];
755     unsigned AS = PtrTy.getAddressSpace();
756     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
757       return true;
758 
759     // Catch weird sized loads that don't evenly divide into the access sizes
760     // TODO: May be able to widen depending on alignment etc.
761     unsigned NumRegs = (MemSize + 31) / 32;
762     if (NumRegs == 3) {
763       if (!ST.hasDwordx3LoadStores())
764         return true;
765     } else {
766       // If the alignment allows, these should have been widened.
767       if (!isPowerOf2_32(NumRegs))
768         return true;
769     }
770 
771     if (Align < MemSize) {
772       const SITargetLowering *TLI = ST.getTargetLowering();
773       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
774     }
775 
776     return false;
777   };
778 
779   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
780     unsigned Size = Query.Types[0].getSizeInBits();
781     if (isPowerOf2_32(Size))
782       return false;
783 
784     if (Size == 96 && ST.hasDwordx3LoadStores())
785       return false;
786 
787     unsigned AddrSpace = Query.Types[1].getAddressSpace();
788     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
789       return false;
790 
791     unsigned Align = Query.MMODescrs[0].AlignInBits;
792     unsigned RoundedSize = NextPowerOf2(Size);
793     return (Align >= RoundedSize);
794   };
795 
796   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
797   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
798   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
799 
800   // TODO: Refine based on subtargets which support unaligned access or 128-bit
801   // LDS
802   // TODO: Unsupported flat for SI.
803 
804   for (unsigned Op : {G_LOAD, G_STORE}) {
805     const bool IsStore = Op == G_STORE;
806 
807     auto &Actions = getActionDefinitionsBuilder(Op);
808     // Whitelist the common cases.
809     // TODO: Loads to s16 on gfx9
810     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
811                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
812                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
813                                       {S128, GlobalPtr, 128, GlobalAlign32},
814                                       {S64, GlobalPtr, 64, GlobalAlign32},
815                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
816                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
817                                       {S32, GlobalPtr, 8, GlobalAlign8},
818                                       {S32, GlobalPtr, 16, GlobalAlign16},
819 
820                                       {S32, LocalPtr, 32, 32},
821                                       {S64, LocalPtr, 64, 32},
822                                       {V2S32, LocalPtr, 64, 32},
823                                       {S32, LocalPtr, 8, 8},
824                                       {S32, LocalPtr, 16, 16},
825                                       {V2S16, LocalPtr, 32, 32},
826 
827                                       {S32, PrivatePtr, 32, 32},
828                                       {S32, PrivatePtr, 8, 8},
829                                       {S32, PrivatePtr, 16, 16},
830                                       {V2S16, PrivatePtr, 32, 32},
831 
832                                       {S32, FlatPtr, 32, GlobalAlign32},
833                                       {S32, FlatPtr, 16, GlobalAlign16},
834                                       {S32, FlatPtr, 8, GlobalAlign8},
835                                       {V2S16, FlatPtr, 32, GlobalAlign32},
836 
837                                       {S32, ConstantPtr, 32, GlobalAlign32},
838                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
839                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
840                                       {S64, ConstantPtr, 64, GlobalAlign32},
841                                       {S128, ConstantPtr, 128, GlobalAlign32},
842                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
843     Actions
844         .customIf(typeIs(1, Constant32Ptr))
845         // Widen suitably aligned loads by loading extra elements.
846         .moreElementsIf([=](const LegalityQuery &Query) {
847             const LLT Ty = Query.Types[0];
848             return Op == G_LOAD && Ty.isVector() &&
849                    shouldWidenLoadResult(Query);
850           }, moreElementsToNextPow2(0))
851         .widenScalarIf([=](const LegalityQuery &Query) {
852             const LLT Ty = Query.Types[0];
853             return Op == G_LOAD && !Ty.isVector() &&
854                    shouldWidenLoadResult(Query);
855           }, widenScalarOrEltToNextPow2(0))
856         .narrowScalarIf(
857             [=](const LegalityQuery &Query) -> bool {
858               return !Query.Types[0].isVector() &&
859                      needToSplitMemOp(Query, Op == G_LOAD);
860             },
861             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
862               const LLT DstTy = Query.Types[0];
863               const LLT PtrTy = Query.Types[1];
864 
865               const unsigned DstSize = DstTy.getSizeInBits();
866               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
867 
868               // Split extloads.
869               if (DstSize > MemSize)
870                 return std::make_pair(0, LLT::scalar(MemSize));
871 
872               if (!isPowerOf2_32(DstSize)) {
873                 // We're probably decomposing an odd sized store. Try to split
874                 // to the widest type. TODO: Account for alignment. As-is it
875                 // should be OK, since the new parts will be further legalized.
876                 unsigned FloorSize = PowerOf2Floor(DstSize);
877                 return std::make_pair(0, LLT::scalar(FloorSize));
878               }
879 
880               if (DstSize > 32 && (DstSize % 32 != 0)) {
881                 // FIXME: Need a way to specify non-extload of larger size if
882                 // suitably aligned.
883                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
884               }
885 
886               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
887                                                      Op == G_LOAD);
888               if (MemSize > MaxSize)
889                 return std::make_pair(0, LLT::scalar(MaxSize));
890 
891               unsigned Align = Query.MMODescrs[0].AlignInBits;
892               return std::make_pair(0, LLT::scalar(Align));
893             })
894         .fewerElementsIf(
895             [=](const LegalityQuery &Query) -> bool {
896               return Query.Types[0].isVector() &&
897                      needToSplitMemOp(Query, Op == G_LOAD);
898             },
899             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
900               const LLT DstTy = Query.Types[0];
901               const LLT PtrTy = Query.Types[1];
902 
903               LLT EltTy = DstTy.getElementType();
904               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
905                                                      Op == G_LOAD);
906 
907               // FIXME: Handle widened to power of 2 results better. This ends
908               // up scalarizing.
909               // FIXME: 3 element stores scalarized on SI
910 
911               // Split if it's too large for the address space.
912               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
913                 unsigned NumElts = DstTy.getNumElements();
914                 unsigned EltSize = EltTy.getSizeInBits();
915 
916                 if (MaxSize % EltSize == 0) {
917                   return std::make_pair(
918                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
919                 }
920 
921                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
922 
923                 // FIXME: Refine when odd breakdowns handled
924                 // The scalars will need to be re-legalized.
925                 if (NumPieces == 1 || NumPieces >= NumElts ||
926                     NumElts % NumPieces != 0)
927                   return std::make_pair(0, EltTy);
928 
929                 return std::make_pair(0,
930                                       LLT::vector(NumElts / NumPieces, EltTy));
931               }
932 
933               // FIXME: We could probably handle weird extending loads better.
934               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
935               if (DstTy.getSizeInBits() > MemSize)
936                 return std::make_pair(0, EltTy);
937 
938               unsigned EltSize = EltTy.getSizeInBits();
939               unsigned DstSize = DstTy.getSizeInBits();
940               if (!isPowerOf2_32(DstSize)) {
941                 // We're probably decomposing an odd sized store. Try to split
942                 // to the widest type. TODO: Account for alignment. As-is it
943                 // should be OK, since the new parts will be further legalized.
944                 unsigned FloorSize = PowerOf2Floor(DstSize);
945                 return std::make_pair(
946                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
947               }
948 
949               // Need to split because of alignment.
950               unsigned Align = Query.MMODescrs[0].AlignInBits;
951               if (EltSize > Align &&
952                   (EltSize / Align < DstTy.getNumElements())) {
953                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
954               }
955 
956               // May need relegalization for the scalars.
957               return std::make_pair(0, EltTy);
958             })
959         .minScalar(0, S32);
960 
961     if (IsStore)
962       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
963 
964     // TODO: Need a bitcast lower option?
965     Actions
966         .legalIf([=](const LegalityQuery &Query) {
967           const LLT Ty0 = Query.Types[0];
968           unsigned Size = Ty0.getSizeInBits();
969           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
970           unsigned Align = Query.MMODescrs[0].AlignInBits;
971 
972           // FIXME: Widening store from alignment not valid.
973           if (MemSize < Size)
974             MemSize = std::max(MemSize, Align);
975 
976           // No extending vector loads.
977           if (Size > MemSize && Ty0.isVector())
978             return false;
979 
980           switch (MemSize) {
981           case 8:
982           case 16:
983             return Size == 32;
984           case 32:
985           case 64:
986           case 128:
987             return true;
988           case 96:
989             return ST.hasDwordx3LoadStores();
990           case 256:
991           case 512:
992             return true;
993           default:
994             return false;
995           }
996         })
997         .widenScalarToNextPow2(0)
998         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
999   }
1000 
1001   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1002                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1003                                                   {S32, GlobalPtr, 16, 2 * 8},
1004                                                   {S32, LocalPtr, 8, 8},
1005                                                   {S32, LocalPtr, 16, 16},
1006                                                   {S32, PrivatePtr, 8, 8},
1007                                                   {S32, PrivatePtr, 16, 16},
1008                                                   {S32, ConstantPtr, 8, 8},
1009                                                   {S32, ConstantPtr, 16, 2 * 8}});
1010   if (ST.hasFlatAddressSpace()) {
1011     ExtLoads.legalForTypesWithMemDesc(
1012         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1013   }
1014 
1015   ExtLoads.clampScalar(0, S32, S32)
1016           .widenScalarToNextPow2(0)
1017           .unsupportedIfMemSizeNotPow2()
1018           .lower();
1019 
1020   auto &Atomics = getActionDefinitionsBuilder(
1021     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1022      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1023      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1024      G_ATOMICRMW_UMIN})
1025     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1026                {S64, GlobalPtr}, {S64, LocalPtr}});
1027   if (ST.hasFlatAddressSpace()) {
1028     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1029   }
1030 
1031   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1032     .legalFor({{S32, LocalPtr}});
1033 
1034   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1035   // demarshalling
1036   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1037     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1038                 {S32, FlatPtr}, {S64, FlatPtr}})
1039     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1040                {S32, RegionPtr}, {S64, RegionPtr}});
1041   // TODO: Pointer types, any 32-bit or 64-bit vector
1042 
1043   // Condition should be s32 for scalar, s1 for vector.
1044   getActionDefinitionsBuilder(G_SELECT)
1045     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1046           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1047           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1048     .clampScalar(0, S16, S64)
1049     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1050     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1051     .scalarize(1)
1052     .clampMaxNumElements(0, S32, 2)
1053     .clampMaxNumElements(0, LocalPtr, 2)
1054     .clampMaxNumElements(0, PrivatePtr, 2)
1055     .scalarize(0)
1056     .widenScalarToNextPow2(0)
1057     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1058 
1059   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1060   // be more flexible with the shift amount type.
1061   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1062     .legalFor({{S32, S32}, {S64, S32}});
1063   if (ST.has16BitInsts()) {
1064     if (ST.hasVOP3PInsts()) {
1065       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1066             .clampMaxNumElements(0, S16, 2);
1067     } else
1068       Shifts.legalFor({{S16, S32}, {S16, S16}});
1069 
1070     // TODO: Support 16-bit shift amounts
1071     Shifts.clampScalar(1, S32, S32);
1072     Shifts.clampScalar(0, S16, S64);
1073     Shifts.widenScalarToNextPow2(0, 16);
1074   } else {
1075     // Make sure we legalize the shift amount type first, as the general
1076     // expansion for the shifted type will produce much worse code if it hasn't
1077     // been truncated already.
1078     Shifts.clampScalar(1, S32, S32);
1079     Shifts.clampScalar(0, S32, S64);
1080     Shifts.widenScalarToNextPow2(0, 32);
1081   }
1082   Shifts.scalarize(0);
1083 
1084   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1085     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1086     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1087     unsigned IdxTypeIdx = 2;
1088 
1089     getActionDefinitionsBuilder(Op)
1090       .customIf([=](const LegalityQuery &Query) {
1091           const LLT EltTy = Query.Types[EltTypeIdx];
1092           const LLT VecTy = Query.Types[VecTypeIdx];
1093           const LLT IdxTy = Query.Types[IdxTypeIdx];
1094           return (EltTy.getSizeInBits() == 16 ||
1095                   EltTy.getSizeInBits() % 32 == 0) &&
1096                  VecTy.getSizeInBits() % 32 == 0 &&
1097                  VecTy.getSizeInBits() <= 1024 &&
1098                  IdxTy.getSizeInBits() == 32;
1099         })
1100       .clampScalar(EltTypeIdx, S32, S64)
1101       .clampScalar(VecTypeIdx, S32, S64)
1102       .clampScalar(IdxTypeIdx, S32, S32);
1103   }
1104 
1105   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1106     .unsupportedIf([=](const LegalityQuery &Query) {
1107         const LLT &EltTy = Query.Types[1].getElementType();
1108         return Query.Types[0] != EltTy;
1109       });
1110 
1111   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1112     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1113     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1114 
1115     // FIXME: Doesn't handle extract of illegal sizes.
1116     getActionDefinitionsBuilder(Op)
1117       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1118       // FIXME: Multiples of 16 should not be legal.
1119       .legalIf([=](const LegalityQuery &Query) {
1120           const LLT BigTy = Query.Types[BigTyIdx];
1121           const LLT LitTy = Query.Types[LitTyIdx];
1122           return (BigTy.getSizeInBits() % 32 == 0) &&
1123                  (LitTy.getSizeInBits() % 16 == 0);
1124         })
1125       .widenScalarIf(
1126         [=](const LegalityQuery &Query) {
1127           const LLT BigTy = Query.Types[BigTyIdx];
1128           return (BigTy.getScalarSizeInBits() < 16);
1129         },
1130         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1131       .widenScalarIf(
1132         [=](const LegalityQuery &Query) {
1133           const LLT LitTy = Query.Types[LitTyIdx];
1134           return (LitTy.getScalarSizeInBits() < 16);
1135         },
1136         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1137       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1138       .widenScalarToNextPow2(BigTyIdx, 32);
1139 
1140   }
1141 
1142   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1143     .legalForCartesianProduct(AllS32Vectors, {S32})
1144     .legalForCartesianProduct(AllS64Vectors, {S64})
1145     .clampNumElements(0, V16S32, V32S32)
1146     .clampNumElements(0, V2S64, V16S64)
1147     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1148 
1149   if (ST.hasScalarPackInsts()) {
1150     BuildVector
1151       // FIXME: Should probably widen s1 vectors straight to s32
1152       .minScalarOrElt(0, S16)
1153       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1154       .minScalar(1, S32);
1155 
1156     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1157       .legalFor({V2S16, S32})
1158       .lower();
1159     BuildVector.minScalarOrElt(0, S32);
1160   } else {
1161     BuildVector.customFor({V2S16, S16});
1162     BuildVector.minScalarOrElt(0, S32);
1163 
1164     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1165       .customFor({V2S16, S32})
1166       .lower();
1167   }
1168 
1169   BuildVector.legalIf(isRegisterType(0));
1170 
1171   // FIXME: Clamp maximum size
1172   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1173     .legalIf(isRegisterType(0));
1174 
1175   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1176   // pre-legalize.
1177   if (ST.hasVOP3PInsts()) {
1178     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1179       .customFor({V2S16, V2S16})
1180       .lower();
1181   } else
1182     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1183 
1184   // Merge/Unmerge
1185   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1186     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1187     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1188 
1189     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1190       const LLT &Ty = Query.Types[TypeIdx];
1191       if (Ty.isVector()) {
1192         const LLT &EltTy = Ty.getElementType();
1193         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1194           return true;
1195         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1196           return true;
1197       }
1198       return false;
1199     };
1200 
1201     auto &Builder = getActionDefinitionsBuilder(Op)
1202       // Try to widen to s16 first for small types.
1203       // TODO: Only do this on targets with legal s16 shifts
1204       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1205 
1206       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1207       .lowerFor({{S16, V2S16}})
1208       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1209       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1210                            elementTypeIs(1, S16)),
1211                        changeTo(1, V2S16))
1212       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1213       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1214       // valid.
1215       .clampScalar(LitTyIdx, S32, S256)
1216       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1217       // Break up vectors with weird elements into scalars
1218       .fewerElementsIf(
1219         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1220         scalarize(0))
1221       .fewerElementsIf(
1222         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1223         scalarize(1))
1224       .clampScalar(BigTyIdx, S32, S1024);
1225 
1226     if (Op == G_MERGE_VALUES) {
1227       Builder.widenScalarIf(
1228         // TODO: Use 16-bit shifts if legal for 8-bit values?
1229         [=](const LegalityQuery &Query) {
1230           const LLT Ty = Query.Types[LitTyIdx];
1231           return Ty.getSizeInBits() < 32;
1232         },
1233         changeTo(LitTyIdx, S32));
1234     }
1235 
1236     Builder.widenScalarIf(
1237       [=](const LegalityQuery &Query) {
1238         const LLT Ty = Query.Types[BigTyIdx];
1239         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1240           Ty.getSizeInBits() % 16 != 0;
1241       },
1242       [=](const LegalityQuery &Query) {
1243         // Pick the next power of 2, or a multiple of 64 over 128.
1244         // Whichever is smaller.
1245         const LLT &Ty = Query.Types[BigTyIdx];
1246         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1247         if (NewSizeInBits >= 256) {
1248           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1249           if (RoundedTo < NewSizeInBits)
1250             NewSizeInBits = RoundedTo;
1251         }
1252         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1253       })
1254       .legalIf([=](const LegalityQuery &Query) {
1255           const LLT &BigTy = Query.Types[BigTyIdx];
1256           const LLT &LitTy = Query.Types[LitTyIdx];
1257 
1258           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1259             return false;
1260           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1261             return false;
1262 
1263           return BigTy.getSizeInBits() % 16 == 0 &&
1264                  LitTy.getSizeInBits() % 16 == 0 &&
1265                  BigTy.getSizeInBits() <= 1024;
1266         })
1267       // Any vectors left are the wrong size. Scalarize them.
1268       .scalarize(0)
1269       .scalarize(1);
1270   }
1271 
1272   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1273   // RegBankSelect.
1274   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1275     .legalFor({{S32}, {S64}});
1276 
1277   if (ST.hasVOP3PInsts()) {
1278     SextInReg.lowerFor({{V2S16}})
1279       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1280       // get more vector shift opportunities, since we'll get those when
1281       // expanded.
1282       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1283   } else if (ST.has16BitInsts()) {
1284     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1285   } else {
1286     // Prefer to promote to s32 before lowering if we don't have 16-bit
1287     // shifts. This avoid a lot of intermediate truncate and extend operations.
1288     SextInReg.lowerFor({{S32}, {S64}});
1289   }
1290 
1291   SextInReg
1292     .scalarize(0)
1293     .clampScalar(0, S32, S64)
1294     .lower();
1295 
1296   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1297     .legalFor({S64});
1298 
1299   getActionDefinitionsBuilder({
1300       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1301       G_FCOPYSIGN,
1302 
1303       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1304       G_READ_REGISTER,
1305       G_WRITE_REGISTER,
1306 
1307       G_SADDO, G_SSUBO,
1308 
1309        // TODO: Implement
1310       G_FMINIMUM, G_FMAXIMUM
1311     }).lower();
1312 
1313   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1314         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1315         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1316     .unsupported();
1317 
1318   computeTables();
1319   verify(*ST.getInstrInfo());
1320 }
1321 
1322 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1323                                          MachineRegisterInfo &MRI,
1324                                          MachineIRBuilder &B,
1325                                          GISelChangeObserver &Observer) const {
1326   switch (MI.getOpcode()) {
1327   case TargetOpcode::G_ADDRSPACE_CAST:
1328     return legalizeAddrSpaceCast(MI, MRI, B);
1329   case TargetOpcode::G_FRINT:
1330     return legalizeFrint(MI, MRI, B);
1331   case TargetOpcode::G_FCEIL:
1332     return legalizeFceil(MI, MRI, B);
1333   case TargetOpcode::G_INTRINSIC_TRUNC:
1334     return legalizeIntrinsicTrunc(MI, MRI, B);
1335   case TargetOpcode::G_SITOFP:
1336     return legalizeITOFP(MI, MRI, B, true);
1337   case TargetOpcode::G_UITOFP:
1338     return legalizeITOFP(MI, MRI, B, false);
1339   case TargetOpcode::G_FPTOSI:
1340     return legalizeFPTOI(MI, MRI, B, true);
1341   case TargetOpcode::G_FPTOUI:
1342     return legalizeFPTOI(MI, MRI, B, false);
1343   case TargetOpcode::G_FMINNUM:
1344   case TargetOpcode::G_FMAXNUM:
1345   case TargetOpcode::G_FMINNUM_IEEE:
1346   case TargetOpcode::G_FMAXNUM_IEEE:
1347     return legalizeMinNumMaxNum(MI, MRI, B);
1348   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1349     return legalizeExtractVectorElt(MI, MRI, B);
1350   case TargetOpcode::G_INSERT_VECTOR_ELT:
1351     return legalizeInsertVectorElt(MI, MRI, B);
1352   case TargetOpcode::G_SHUFFLE_VECTOR:
1353     return legalizeShuffleVector(MI, MRI, B);
1354   case TargetOpcode::G_FSIN:
1355   case TargetOpcode::G_FCOS:
1356     return legalizeSinCos(MI, MRI, B);
1357   case TargetOpcode::G_GLOBAL_VALUE:
1358     return legalizeGlobalValue(MI, MRI, B);
1359   case TargetOpcode::G_LOAD:
1360     return legalizeLoad(MI, MRI, B, Observer);
1361   case TargetOpcode::G_FMAD:
1362     return legalizeFMad(MI, MRI, B);
1363   case TargetOpcode::G_FDIV:
1364     return legalizeFDIV(MI, MRI, B);
1365   case TargetOpcode::G_UDIV:
1366   case TargetOpcode::G_UREM:
1367     return legalizeUDIV_UREM(MI, MRI, B);
1368   case TargetOpcode::G_SDIV:
1369   case TargetOpcode::G_SREM:
1370     return legalizeSDIV_SREM(MI, MRI, B);
1371   case TargetOpcode::G_ATOMIC_CMPXCHG:
1372     return legalizeAtomicCmpXChg(MI, MRI, B);
1373   case TargetOpcode::G_FLOG:
1374     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1375   case TargetOpcode::G_FLOG10:
1376     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1377   case TargetOpcode::G_FEXP:
1378     return legalizeFExp(MI, B);
1379   case TargetOpcode::G_FPOW:
1380     return legalizeFPow(MI, B);
1381   case TargetOpcode::G_FFLOOR:
1382     return legalizeFFloor(MI, MRI, B);
1383   case TargetOpcode::G_BUILD_VECTOR:
1384     return legalizeBuildVector(MI, MRI, B);
1385   default:
1386     return false;
1387   }
1388 
1389   llvm_unreachable("expected switch to return");
1390 }
1391 
1392 Register AMDGPULegalizerInfo::getSegmentAperture(
1393   unsigned AS,
1394   MachineRegisterInfo &MRI,
1395   MachineIRBuilder &B) const {
1396   MachineFunction &MF = B.getMF();
1397   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1398   const LLT S32 = LLT::scalar(32);
1399 
1400   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1401 
1402   if (ST.hasApertureRegs()) {
1403     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1404     // getreg.
1405     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1406         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1407         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1408     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1409         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1410         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1411     unsigned Encoding =
1412         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1413         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1414         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1415 
1416     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1417 
1418     B.buildInstr(AMDGPU::S_GETREG_B32)
1419       .addDef(GetReg)
1420       .addImm(Encoding);
1421     MRI.setType(GetReg, S32);
1422 
1423     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1424     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1425   }
1426 
1427   Register QueuePtr = MRI.createGenericVirtualRegister(
1428     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1429 
1430   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1431   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1432     return Register();
1433 
1434   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1435   // private_segment_aperture_base_hi.
1436   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1437 
1438   // TODO: can we be smarter about machine pointer info?
1439   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1440   MachineMemOperand *MMO = MF.getMachineMemOperand(
1441     PtrInfo,
1442     MachineMemOperand::MOLoad |
1443     MachineMemOperand::MODereferenceable |
1444     MachineMemOperand::MOInvariant,
1445     4,
1446     MinAlign(64, StructOffset));
1447 
1448   Register LoadAddr;
1449 
1450   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1451   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1452 }
1453 
1454 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1455   MachineInstr &MI, MachineRegisterInfo &MRI,
1456   MachineIRBuilder &B) const {
1457   MachineFunction &MF = B.getMF();
1458 
1459   B.setInstr(MI);
1460 
1461   const LLT S32 = LLT::scalar(32);
1462   Register Dst = MI.getOperand(0).getReg();
1463   Register Src = MI.getOperand(1).getReg();
1464 
1465   LLT DstTy = MRI.getType(Dst);
1466   LLT SrcTy = MRI.getType(Src);
1467   unsigned DestAS = DstTy.getAddressSpace();
1468   unsigned SrcAS = SrcTy.getAddressSpace();
1469 
1470   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1471   // vector element.
1472   assert(!DstTy.isVector());
1473 
1474   const AMDGPUTargetMachine &TM
1475     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1476 
1477   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1478   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1479     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1480     return true;
1481   }
1482 
1483   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1484     // Truncate.
1485     B.buildExtract(Dst, Src, 0);
1486     MI.eraseFromParent();
1487     return true;
1488   }
1489 
1490   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1491     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1492     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1493 
1494     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1495     // another. Merge operands are required to be the same type, but creating an
1496     // extra ptrtoint would be kind of pointless.
1497     auto HighAddr = B.buildConstant(
1498       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1499     B.buildMerge(Dst, {Src, HighAddr});
1500     MI.eraseFromParent();
1501     return true;
1502   }
1503 
1504   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1505     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1506            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1507     unsigned NullVal = TM.getNullPointerValue(DestAS);
1508 
1509     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1510     auto FlatNull = B.buildConstant(SrcTy, 0);
1511 
1512     // Extract low 32-bits of the pointer.
1513     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1514 
1515     auto CmpRes =
1516         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1517     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1518 
1519     MI.eraseFromParent();
1520     return true;
1521   }
1522 
1523   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1524     return false;
1525 
1526   if (!ST.hasFlatAddressSpace())
1527     return false;
1528 
1529   auto SegmentNull =
1530       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1531   auto FlatNull =
1532       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1533 
1534   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1535   if (!ApertureReg.isValid())
1536     return false;
1537 
1538   auto CmpRes =
1539       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1540 
1541   // Coerce the type of the low half of the result so we can use merge_values.
1542   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1543 
1544   // TODO: Should we allow mismatched types but matching sizes in merges to
1545   // avoid the ptrtoint?
1546   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1547   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1548 
1549   MI.eraseFromParent();
1550   return true;
1551 }
1552 
1553 bool AMDGPULegalizerInfo::legalizeFrint(
1554   MachineInstr &MI, MachineRegisterInfo &MRI,
1555   MachineIRBuilder &B) const {
1556   B.setInstr(MI);
1557 
1558   Register Src = MI.getOperand(1).getReg();
1559   LLT Ty = MRI.getType(Src);
1560   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1561 
1562   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1563   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1564 
1565   auto C1 = B.buildFConstant(Ty, C1Val);
1566   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1567 
1568   // TODO: Should this propagate fast-math-flags?
1569   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1570   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1571 
1572   auto C2 = B.buildFConstant(Ty, C2Val);
1573   auto Fabs = B.buildFAbs(Ty, Src);
1574 
1575   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1576   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1577   return true;
1578 }
1579 
1580 bool AMDGPULegalizerInfo::legalizeFceil(
1581   MachineInstr &MI, MachineRegisterInfo &MRI,
1582   MachineIRBuilder &B) const {
1583   B.setInstr(MI);
1584 
1585   const LLT S1 = LLT::scalar(1);
1586   const LLT S64 = LLT::scalar(64);
1587 
1588   Register Src = MI.getOperand(1).getReg();
1589   assert(MRI.getType(Src) == S64);
1590 
1591   // result = trunc(src)
1592   // if (src > 0.0 && src != result)
1593   //   result += 1.0
1594 
1595   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1596 
1597   const auto Zero = B.buildFConstant(S64, 0.0);
1598   const auto One = B.buildFConstant(S64, 1.0);
1599   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1600   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1601   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1602   auto Add = B.buildSelect(S64, And, One, Zero);
1603 
1604   // TODO: Should this propagate fast-math-flags?
1605   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1606   return true;
1607 }
1608 
1609 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1610                                               MachineIRBuilder &B) {
1611   const unsigned FractBits = 52;
1612   const unsigned ExpBits = 11;
1613   LLT S32 = LLT::scalar(32);
1614 
1615   auto Const0 = B.buildConstant(S32, FractBits - 32);
1616   auto Const1 = B.buildConstant(S32, ExpBits);
1617 
1618   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1619     .addUse(Const0.getReg(0))
1620     .addUse(Const1.getReg(0));
1621 
1622   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1623 }
1624 
1625 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1626   MachineInstr &MI, MachineRegisterInfo &MRI,
1627   MachineIRBuilder &B) const {
1628   B.setInstr(MI);
1629 
1630   const LLT S1 = LLT::scalar(1);
1631   const LLT S32 = LLT::scalar(32);
1632   const LLT S64 = LLT::scalar(64);
1633 
1634   Register Src = MI.getOperand(1).getReg();
1635   assert(MRI.getType(Src) == S64);
1636 
1637   // TODO: Should this use extract since the low half is unused?
1638   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1639   Register Hi = Unmerge.getReg(1);
1640 
1641   // Extract the upper half, since this is where we will find the sign and
1642   // exponent.
1643   auto Exp = extractF64Exponent(Hi, B);
1644 
1645   const unsigned FractBits = 52;
1646 
1647   // Extract the sign bit.
1648   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1649   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1650 
1651   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1652 
1653   const auto Zero32 = B.buildConstant(S32, 0);
1654 
1655   // Extend back to 64-bits.
1656   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1657 
1658   auto Shr = B.buildAShr(S64, FractMask, Exp);
1659   auto Not = B.buildNot(S64, Shr);
1660   auto Tmp0 = B.buildAnd(S64, Src, Not);
1661   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1662 
1663   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1664   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1665 
1666   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1667   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1668   return true;
1669 }
1670 
1671 bool AMDGPULegalizerInfo::legalizeITOFP(
1672   MachineInstr &MI, MachineRegisterInfo &MRI,
1673   MachineIRBuilder &B, bool Signed) const {
1674   B.setInstr(MI);
1675 
1676   Register Dst = MI.getOperand(0).getReg();
1677   Register Src = MI.getOperand(1).getReg();
1678 
1679   const LLT S64 = LLT::scalar(64);
1680   const LLT S32 = LLT::scalar(32);
1681 
1682   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1683 
1684   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1685 
1686   auto CvtHi = Signed ?
1687     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1688     B.buildUITOFP(S64, Unmerge.getReg(1));
1689 
1690   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1691 
1692   auto ThirtyTwo = B.buildConstant(S32, 32);
1693   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1694     .addUse(CvtHi.getReg(0))
1695     .addUse(ThirtyTwo.getReg(0));
1696 
1697   // TODO: Should this propagate fast-math-flags?
1698   B.buildFAdd(Dst, LdExp, CvtLo);
1699   MI.eraseFromParent();
1700   return true;
1701 }
1702 
1703 // TODO: Copied from DAG implementation. Verify logic and document how this
1704 // actually works.
1705 bool AMDGPULegalizerInfo::legalizeFPTOI(
1706   MachineInstr &MI, MachineRegisterInfo &MRI,
1707   MachineIRBuilder &B, bool Signed) const {
1708   B.setInstr(MI);
1709 
1710   Register Dst = MI.getOperand(0).getReg();
1711   Register Src = MI.getOperand(1).getReg();
1712 
1713   const LLT S64 = LLT::scalar(64);
1714   const LLT S32 = LLT::scalar(32);
1715 
1716   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1717 
1718   unsigned Flags = MI.getFlags();
1719 
1720   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1721   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1722   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1723 
1724   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1725   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1726   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1727 
1728   auto Hi = Signed ?
1729     B.buildFPTOSI(S32, FloorMul) :
1730     B.buildFPTOUI(S32, FloorMul);
1731   auto Lo = B.buildFPTOUI(S32, Fma);
1732 
1733   B.buildMerge(Dst, { Lo, Hi });
1734   MI.eraseFromParent();
1735 
1736   return true;
1737 }
1738 
1739 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1740   MachineInstr &MI, MachineRegisterInfo &MRI,
1741   MachineIRBuilder &B) const {
1742   MachineFunction &MF = B.getMF();
1743   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1744 
1745   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1746                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1747 
1748   // With ieee_mode disabled, the instructions have the correct behavior
1749   // already for G_FMINNUM/G_FMAXNUM
1750   if (!MFI->getMode().IEEE)
1751     return !IsIEEEOp;
1752 
1753   if (IsIEEEOp)
1754     return true;
1755 
1756   MachineIRBuilder HelperBuilder(MI);
1757   GISelObserverWrapper DummyObserver;
1758   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1759   HelperBuilder.setInstr(MI);
1760   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1761 }
1762 
1763 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1764   MachineInstr &MI, MachineRegisterInfo &MRI,
1765   MachineIRBuilder &B) const {
1766   // TODO: Should move some of this into LegalizerHelper.
1767 
1768   // TODO: Promote dynamic indexing of s16 to s32
1769 
1770   // FIXME: Artifact combiner probably should have replaced the truncated
1771   // constant before this, so we shouldn't need
1772   // getConstantVRegValWithLookThrough.
1773   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1774     MI.getOperand(2).getReg(), MRI);
1775   if (!IdxVal) // Dynamic case will be selected to register indexing.
1776     return true;
1777 
1778   Register Dst = MI.getOperand(0).getReg();
1779   Register Vec = MI.getOperand(1).getReg();
1780 
1781   LLT VecTy = MRI.getType(Vec);
1782   LLT EltTy = VecTy.getElementType();
1783   assert(EltTy == MRI.getType(Dst));
1784 
1785   B.setInstr(MI);
1786 
1787   if (IdxVal->Value < VecTy.getNumElements())
1788     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1789   else
1790     B.buildUndef(Dst);
1791 
1792   MI.eraseFromParent();
1793   return true;
1794 }
1795 
1796 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1797   MachineInstr &MI, MachineRegisterInfo &MRI,
1798   MachineIRBuilder &B) const {
1799   // TODO: Should move some of this into LegalizerHelper.
1800 
1801   // TODO: Promote dynamic indexing of s16 to s32
1802 
1803   // FIXME: Artifact combiner probably should have replaced the truncated
1804   // constant before this, so we shouldn't need
1805   // getConstantVRegValWithLookThrough.
1806   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1807     MI.getOperand(3).getReg(), MRI);
1808   if (!IdxVal) // Dynamic case will be selected to register indexing.
1809     return true;
1810 
1811   Register Dst = MI.getOperand(0).getReg();
1812   Register Vec = MI.getOperand(1).getReg();
1813   Register Ins = MI.getOperand(2).getReg();
1814 
1815   LLT VecTy = MRI.getType(Vec);
1816   LLT EltTy = VecTy.getElementType();
1817   assert(EltTy == MRI.getType(Ins));
1818 
1819   B.setInstr(MI);
1820 
1821   if (IdxVal->Value < VecTy.getNumElements())
1822     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1823   else
1824     B.buildUndef(Dst);
1825 
1826   MI.eraseFromParent();
1827   return true;
1828 }
1829 
1830 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1831   MachineInstr &MI, MachineRegisterInfo &MRI,
1832   MachineIRBuilder &B) const {
1833   const LLT V2S16 = LLT::vector(2, 16);
1834 
1835   Register Dst = MI.getOperand(0).getReg();
1836   Register Src0 = MI.getOperand(1).getReg();
1837   LLT DstTy = MRI.getType(Dst);
1838   LLT SrcTy = MRI.getType(Src0);
1839 
1840   if (SrcTy == V2S16 && DstTy == V2S16 &&
1841       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1842     return true;
1843 
1844   MachineIRBuilder HelperBuilder(MI);
1845   GISelObserverWrapper DummyObserver;
1846   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1847   HelperBuilder.setInstr(MI);
1848   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1849 }
1850 
1851 bool AMDGPULegalizerInfo::legalizeSinCos(
1852   MachineInstr &MI, MachineRegisterInfo &MRI,
1853   MachineIRBuilder &B) const {
1854   B.setInstr(MI);
1855 
1856   Register DstReg = MI.getOperand(0).getReg();
1857   Register SrcReg = MI.getOperand(1).getReg();
1858   LLT Ty = MRI.getType(DstReg);
1859   unsigned Flags = MI.getFlags();
1860 
1861   Register TrigVal;
1862   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1863   if (ST.hasTrigReducedRange()) {
1864     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1865     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1866       .addUse(MulVal.getReg(0))
1867       .setMIFlags(Flags).getReg(0);
1868   } else
1869     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1870 
1871   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1872     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1873   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1874     .addUse(TrigVal)
1875     .setMIFlags(Flags);
1876   MI.eraseFromParent();
1877   return true;
1878 }
1879 
1880 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1881   Register DstReg, LLT PtrTy,
1882   MachineIRBuilder &B, const GlobalValue *GV,
1883   unsigned Offset, unsigned GAFlags) const {
1884   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1885   // to the following code sequence:
1886   //
1887   // For constant address space:
1888   //   s_getpc_b64 s[0:1]
1889   //   s_add_u32 s0, s0, $symbol
1890   //   s_addc_u32 s1, s1, 0
1891   //
1892   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1893   //   a fixup or relocation is emitted to replace $symbol with a literal
1894   //   constant, which is a pc-relative offset from the encoding of the $symbol
1895   //   operand to the global variable.
1896   //
1897   // For global address space:
1898   //   s_getpc_b64 s[0:1]
1899   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1900   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1901   //
1902   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1903   //   fixups or relocations are emitted to replace $symbol@*@lo and
1904   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1905   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1906   //   operand to the global variable.
1907   //
1908   // What we want here is an offset from the value returned by s_getpc
1909   // (which is the address of the s_add_u32 instruction) to the global
1910   // variable, but since the encoding of $symbol starts 4 bytes after the start
1911   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1912   // small. This requires us to add 4 to the global variable offset in order to
1913   // compute the correct address.
1914 
1915   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1916 
1917   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1918     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1919 
1920   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1921     .addDef(PCReg);
1922 
1923   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1924   if (GAFlags == SIInstrInfo::MO_NONE)
1925     MIB.addImm(0);
1926   else
1927     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1928 
1929   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1930 
1931   if (PtrTy.getSizeInBits() == 32)
1932     B.buildExtract(DstReg, PCReg, 0);
1933   return true;
1934  }
1935 
1936 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1937   MachineInstr &MI, MachineRegisterInfo &MRI,
1938   MachineIRBuilder &B) const {
1939   Register DstReg = MI.getOperand(0).getReg();
1940   LLT Ty = MRI.getType(DstReg);
1941   unsigned AS = Ty.getAddressSpace();
1942 
1943   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1944   MachineFunction &MF = B.getMF();
1945   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1946   B.setInstr(MI);
1947 
1948   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1949     if (!MFI->isEntryFunction()) {
1950       const Function &Fn = MF.getFunction();
1951       DiagnosticInfoUnsupported BadLDSDecl(
1952         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1953       Fn.getContext().diagnose(BadLDSDecl);
1954     }
1955 
1956     // TODO: We could emit code to handle the initialization somewhere.
1957     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1958       const SITargetLowering *TLI = ST.getTargetLowering();
1959       if (!TLI->shouldUseLDSConstAddress(GV)) {
1960         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1961         return true; // Leave in place;
1962       }
1963 
1964       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1965       MI.eraseFromParent();
1966       return true;
1967     }
1968 
1969     const Function &Fn = MF.getFunction();
1970     DiagnosticInfoUnsupported BadInit(
1971       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1972     Fn.getContext().diagnose(BadInit);
1973     return true;
1974   }
1975 
1976   const SITargetLowering *TLI = ST.getTargetLowering();
1977 
1978   if (TLI->shouldEmitFixup(GV)) {
1979     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1980     MI.eraseFromParent();
1981     return true;
1982   }
1983 
1984   if (TLI->shouldEmitPCReloc(GV)) {
1985     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1986     MI.eraseFromParent();
1987     return true;
1988   }
1989 
1990   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1991   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1992 
1993   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1994     MachinePointerInfo::getGOT(MF),
1995     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1996     MachineMemOperand::MOInvariant,
1997     8 /*Size*/, 8 /*Align*/);
1998 
1999   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2000 
2001   if (Ty.getSizeInBits() == 32) {
2002     // Truncate if this is a 32-bit constant adrdess.
2003     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2004     B.buildExtract(DstReg, Load, 0);
2005   } else
2006     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2007 
2008   MI.eraseFromParent();
2009   return true;
2010 }
2011 
2012 bool AMDGPULegalizerInfo::legalizeLoad(
2013   MachineInstr &MI, MachineRegisterInfo &MRI,
2014   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2015   B.setInstr(MI);
2016   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2017   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2018   Observer.changingInstr(MI);
2019   MI.getOperand(1).setReg(Cast.getReg(0));
2020   Observer.changedInstr(MI);
2021   return true;
2022 }
2023 
2024 bool AMDGPULegalizerInfo::legalizeFMad(
2025   MachineInstr &MI, MachineRegisterInfo &MRI,
2026   MachineIRBuilder &B) const {
2027   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2028   assert(Ty.isScalar());
2029 
2030   MachineFunction &MF = B.getMF();
2031   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2032 
2033   // TODO: Always legal with future ftz flag.
2034   // FIXME: Do we need just output?
2035   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2036     return true;
2037   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2038     return true;
2039 
2040   MachineIRBuilder HelperBuilder(MI);
2041   GISelObserverWrapper DummyObserver;
2042   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2043   HelperBuilder.setMBB(*MI.getParent());
2044   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2045 }
2046 
2047 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2048   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2049   Register DstReg = MI.getOperand(0).getReg();
2050   Register PtrReg = MI.getOperand(1).getReg();
2051   Register CmpVal = MI.getOperand(2).getReg();
2052   Register NewVal = MI.getOperand(3).getReg();
2053 
2054   assert(SITargetLowering::isFlatGlobalAddrSpace(
2055            MRI.getType(PtrReg).getAddressSpace()) &&
2056          "this should not have been custom lowered");
2057 
2058   LLT ValTy = MRI.getType(CmpVal);
2059   LLT VecTy = LLT::vector(2, ValTy);
2060 
2061   B.setInstr(MI);
2062   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2063 
2064   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2065     .addDef(DstReg)
2066     .addUse(PtrReg)
2067     .addUse(PackedVal)
2068     .setMemRefs(MI.memoperands());
2069 
2070   MI.eraseFromParent();
2071   return true;
2072 }
2073 
2074 bool AMDGPULegalizerInfo::legalizeFlog(
2075   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2076   Register Dst = MI.getOperand(0).getReg();
2077   Register Src = MI.getOperand(1).getReg();
2078   LLT Ty = B.getMRI()->getType(Dst);
2079   unsigned Flags = MI.getFlags();
2080   B.setInstr(MI);
2081 
2082   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2083   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2084 
2085   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2086   MI.eraseFromParent();
2087   return true;
2088 }
2089 
2090 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2091                                        MachineIRBuilder &B) const {
2092   Register Dst = MI.getOperand(0).getReg();
2093   Register Src = MI.getOperand(1).getReg();
2094   unsigned Flags = MI.getFlags();
2095   LLT Ty = B.getMRI()->getType(Dst);
2096   B.setInstr(MI);
2097 
2098   auto K = B.buildFConstant(Ty, numbers::log2e);
2099   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2100   B.buildFExp2(Dst, Mul, Flags);
2101   MI.eraseFromParent();
2102   return true;
2103 }
2104 
2105 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2106                                        MachineIRBuilder &B) const {
2107   Register Dst = MI.getOperand(0).getReg();
2108   Register Src0 = MI.getOperand(1).getReg();
2109   Register Src1 = MI.getOperand(2).getReg();
2110   unsigned Flags = MI.getFlags();
2111   LLT Ty = B.getMRI()->getType(Dst);
2112   B.setInstr(MI);
2113   const LLT S16 = LLT::scalar(16);
2114   const LLT S32 = LLT::scalar(32);
2115 
2116   if (Ty == S32) {
2117     auto Log = B.buildFLog2(S32, Src0, Flags);
2118     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2119       .addUse(Log.getReg(0))
2120       .addUse(Src1)
2121       .setMIFlags(Flags);
2122     B.buildFExp2(Dst, Mul, Flags);
2123   } else if (Ty == S16) {
2124     // There's no f16 fmul_legacy, so we need to convert for it.
2125     auto Log = B.buildFLog2(S16, Src0, Flags);
2126     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2127     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2128     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2129       .addUse(Ext0.getReg(0))
2130       .addUse(Ext1.getReg(0))
2131       .setMIFlags(Flags);
2132 
2133     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2134   } else
2135     return false;
2136 
2137   MI.eraseFromParent();
2138   return true;
2139 }
2140 
2141 // Find a source register, ignoring any possible source modifiers.
2142 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2143   Register ModSrc = OrigSrc;
2144   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2145     ModSrc = SrcFNeg->getOperand(1).getReg();
2146     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2147       ModSrc = SrcFAbs->getOperand(1).getReg();
2148   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2149     ModSrc = SrcFAbs->getOperand(1).getReg();
2150   return ModSrc;
2151 }
2152 
2153 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2154                                          MachineRegisterInfo &MRI,
2155                                          MachineIRBuilder &B) const {
2156   B.setInstr(MI);
2157 
2158   const LLT S1 = LLT::scalar(1);
2159   const LLT S64 = LLT::scalar(64);
2160   Register Dst = MI.getOperand(0).getReg();
2161   Register OrigSrc = MI.getOperand(1).getReg();
2162   unsigned Flags = MI.getFlags();
2163   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2164          "this should not have been custom lowered");
2165 
2166   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2167   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2168   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2169   // V_FRACT bug is:
2170   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2171   //
2172   // Convert floor(x) to (x - fract(x))
2173 
2174   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2175     .addUse(OrigSrc)
2176     .setMIFlags(Flags);
2177 
2178   // Give source modifier matching some assistance before obscuring a foldable
2179   // pattern.
2180 
2181   // TODO: We can avoid the neg on the fract? The input sign to fract
2182   // shouldn't matter?
2183   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2184 
2185   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2186 
2187   Register Min = MRI.createGenericVirtualRegister(S64);
2188 
2189   // We don't need to concern ourselves with the snan handling difference, so
2190   // use the one which will directly select.
2191   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2192   if (MFI->getMode().IEEE)
2193     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2194   else
2195     B.buildFMinNum(Min, Fract, Const, Flags);
2196 
2197   Register CorrectedFract = Min;
2198   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2199     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2200     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2201   }
2202 
2203   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2204   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2205 
2206   MI.eraseFromParent();
2207   return true;
2208 }
2209 
2210 // Turn an illegal packed v2s16 build vector into bit operations.
2211 // TODO: This should probably be a bitcast action in LegalizerHelper.
2212 bool AMDGPULegalizerInfo::legalizeBuildVector(
2213   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2214   Register Dst = MI.getOperand(0).getReg();
2215   LLT DstTy = MRI.getType(Dst);
2216   const LLT S32 = LLT::scalar(32);
2217   const LLT V2S16 = LLT::vector(2, 16);
2218   (void)DstTy;
2219   (void)V2S16;
2220   assert(DstTy == V2S16);
2221 
2222   Register Src0 = MI.getOperand(1).getReg();
2223   Register Src1 = MI.getOperand(2).getReg();
2224   assert(MRI.getType(Src0) == LLT::scalar(16));
2225 
2226   B.setInstr(MI);
2227   auto Merge = B.buildMerge(S32, {Src0, Src1});
2228   B.buildBitcast(Dst, Merge);
2229 
2230   MI.eraseFromParent();
2231   return true;
2232 }
2233 
2234 // Return the use branch instruction, otherwise null if the usage is invalid.
2235 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2236                                        MachineRegisterInfo &MRI,
2237                                        MachineInstr *&Br) {
2238   Register CondDef = MI.getOperand(0).getReg();
2239   if (!MRI.hasOneNonDBGUse(CondDef))
2240     return nullptr;
2241 
2242   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2243   if (UseMI.getParent() != MI.getParent() ||
2244       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2245     return nullptr;
2246 
2247   // Make sure the cond br is followed by a G_BR
2248   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2249   if (Next != MI.getParent()->end()) {
2250     if (Next->getOpcode() != AMDGPU::G_BR)
2251       return nullptr;
2252     Br = &*Next;
2253   }
2254 
2255   return &UseMI;
2256 }
2257 
2258 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2259                                                 Register Reg, LLT Ty) const {
2260   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2261   if (LiveIn)
2262     return LiveIn;
2263 
2264   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2265   MRI.addLiveIn(Reg, NewReg);
2266   return NewReg;
2267 }
2268 
2269 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2270                                          const ArgDescriptor *Arg) const {
2271   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2272     return false; // TODO: Handle these
2273 
2274   assert(Arg->getRegister().isPhysical());
2275 
2276   MachineRegisterInfo &MRI = *B.getMRI();
2277 
2278   LLT Ty = MRI.getType(DstReg);
2279   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2280 
2281   if (Arg->isMasked()) {
2282     // TODO: Should we try to emit this once in the entry block?
2283     const LLT S32 = LLT::scalar(32);
2284     const unsigned Mask = Arg->getMask();
2285     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2286 
2287     Register AndMaskSrc = LiveIn;
2288 
2289     if (Shift != 0) {
2290       auto ShiftAmt = B.buildConstant(S32, Shift);
2291       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2292     }
2293 
2294     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2295   } else
2296     B.buildCopy(DstReg, LiveIn);
2297 
2298   // Insert the argument copy if it doens't already exist.
2299   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2300   if (!MRI.getVRegDef(LiveIn)) {
2301     // FIXME: Should have scoped insert pt
2302     MachineBasicBlock &OrigInsBB = B.getMBB();
2303     auto OrigInsPt = B.getInsertPt();
2304 
2305     MachineBasicBlock &EntryMBB = B.getMF().front();
2306     EntryMBB.addLiveIn(Arg->getRegister());
2307     B.setInsertPt(EntryMBB, EntryMBB.begin());
2308     B.buildCopy(LiveIn, Arg->getRegister());
2309 
2310     B.setInsertPt(OrigInsBB, OrigInsPt);
2311   }
2312 
2313   return true;
2314 }
2315 
2316 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2317   MachineInstr &MI,
2318   MachineRegisterInfo &MRI,
2319   MachineIRBuilder &B,
2320   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2321   B.setInstr(MI);
2322 
2323   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2324 
2325   const ArgDescriptor *Arg;
2326   const TargetRegisterClass *RC;
2327   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2328   if (!Arg) {
2329     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2330     return false;
2331   }
2332 
2333   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2334     MI.eraseFromParent();
2335     return true;
2336   }
2337 
2338   return false;
2339 }
2340 
2341 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2342                                        MachineRegisterInfo &MRI,
2343                                        MachineIRBuilder &B) const {
2344   B.setInstr(MI);
2345   Register Dst = MI.getOperand(0).getReg();
2346   LLT DstTy = MRI.getType(Dst);
2347   LLT S16 = LLT::scalar(16);
2348   LLT S32 = LLT::scalar(32);
2349   LLT S64 = LLT::scalar(64);
2350 
2351   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2352     return true;
2353 
2354   if (DstTy == S16)
2355     return legalizeFDIV16(MI, MRI, B);
2356   if (DstTy == S32)
2357     return legalizeFDIV32(MI, MRI, B);
2358   if (DstTy == S64)
2359     return legalizeFDIV64(MI, MRI, B);
2360 
2361   return false;
2362 }
2363 
2364 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2365   const LLT S32 = LLT::scalar(32);
2366 
2367   auto Cvt0 = B.buildUITOFP(S32, Src);
2368   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2369   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2370   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2371   return B.buildFPTOUI(S32, Mul).getReg(0);
2372 }
2373 
2374 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2375                                                   Register DstReg,
2376                                                   Register Num,
2377                                                   Register Den,
2378                                                   bool IsRem) const {
2379   const LLT S1 = LLT::scalar(1);
2380   const LLT S32 = LLT::scalar(32);
2381 
2382   // RCP =  URECIP(Den) = 2^32 / Den + e
2383   // e is rounding error.
2384   auto RCP = buildDivRCP(B, Den);
2385 
2386   // RCP_LO = mul(RCP, Den)
2387   auto RCP_LO = B.buildMul(S32, RCP, Den);
2388 
2389   // RCP_HI = mulhu (RCP, Den) */
2390   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2391 
2392   // NEG_RCP_LO = -RCP_LO
2393   auto Zero = B.buildConstant(S32, 0);
2394   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2395 
2396   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2397   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2398   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2399 
2400   // Calculate the rounding error from the URECIP instruction
2401   // E = mulhu(ABS_RCP_LO, RCP)
2402   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2403 
2404   // RCP_A_E = RCP + E
2405   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2406 
2407   // RCP_S_E = RCP - E
2408   auto RCP_S_E = B.buildSub(S32, RCP, E);
2409 
2410   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2411   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2412 
2413   // Quotient = mulhu(Tmp0, Num)stmp
2414   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2415 
2416   // Num_S_Remainder = Quotient * Den
2417   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2418 
2419   // Remainder = Num - Num_S_Remainder
2420   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2421 
2422   // Remainder_GE_Den = Remainder >= Den
2423   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2424 
2425   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2426   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2427                                        Num, Num_S_Remainder);
2428 
2429   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2430   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2431 
2432   // Calculate Division result:
2433 
2434   // Quotient_A_One = Quotient + 1
2435   auto One = B.buildConstant(S32, 1);
2436   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2437 
2438   // Quotient_S_One = Quotient - 1
2439   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2440 
2441   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2442   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2443 
2444   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2445   if (IsRem) {
2446     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2447 
2448     // Calculate Rem result:
2449     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2450 
2451     // Remainder_A_Den = Remainder + Den
2452     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2453 
2454     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2455     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2456 
2457     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2458     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2459   } else {
2460     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2461   }
2462 }
2463 
2464 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2465                                               MachineRegisterInfo &MRI,
2466                                               MachineIRBuilder &B) const {
2467   B.setInstr(MI);
2468   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2469   Register DstReg = MI.getOperand(0).getReg();
2470   Register Num = MI.getOperand(1).getReg();
2471   Register Den = MI.getOperand(2).getReg();
2472   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2473   MI.eraseFromParent();
2474   return true;
2475 }
2476 
2477 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2478                                             MachineRegisterInfo &MRI,
2479                                             MachineIRBuilder &B) const {
2480   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2481     return legalizeUDIV_UREM32(MI, MRI, B);
2482   return false;
2483 }
2484 
2485 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2486                                               MachineRegisterInfo &MRI,
2487                                               MachineIRBuilder &B) const {
2488   B.setInstr(MI);
2489   const LLT S32 = LLT::scalar(32);
2490 
2491   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2492   Register DstReg = MI.getOperand(0).getReg();
2493   Register LHS = MI.getOperand(1).getReg();
2494   Register RHS = MI.getOperand(2).getReg();
2495 
2496   auto ThirtyOne = B.buildConstant(S32, 31);
2497   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2498   auto RHSign = B.buildAShr(S32, LHS, ThirtyOne);
2499 
2500   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2501   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2502 
2503   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2504   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2505 
2506   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2507   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2508 
2509   if (IsRem) {
2510     auto RSign = LHSign; // Remainder sign is the same as LHS
2511     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2512     B.buildSub(DstReg, UDivRem, RSign);
2513   } else {
2514     auto DSign = B.buildXor(S32, LHSign, RHSign);
2515     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2516     B.buildSub(DstReg, UDivRem, DSign);
2517   }
2518 
2519   MI.eraseFromParent();
2520   return true;
2521 }
2522 
2523 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2524                                             MachineRegisterInfo &MRI,
2525                                             MachineIRBuilder &B) const {
2526   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2527     return legalizeSDIV_SREM32(MI, MRI, B);
2528   return false;
2529 }
2530 
2531 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2532                                                  MachineRegisterInfo &MRI,
2533                                                  MachineIRBuilder &B) const {
2534   Register Res = MI.getOperand(0).getReg();
2535   Register LHS = MI.getOperand(1).getReg();
2536   Register RHS = MI.getOperand(2).getReg();
2537 
2538   uint16_t Flags = MI.getFlags();
2539 
2540   LLT ResTy = MRI.getType(Res);
2541   LLT S32 = LLT::scalar(32);
2542   LLT S64 = LLT::scalar(64);
2543 
2544   const MachineFunction &MF = B.getMF();
2545   bool Unsafe =
2546     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2547 
2548   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2549     return false;
2550 
2551   if (!Unsafe && ResTy == S32 &&
2552       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2553     return false;
2554 
2555   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2556     // 1 / x -> RCP(x)
2557     if (CLHS->isExactlyValue(1.0)) {
2558       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2559         .addUse(RHS)
2560         .setMIFlags(Flags);
2561 
2562       MI.eraseFromParent();
2563       return true;
2564     }
2565 
2566     // -1 / x -> RCP( FNEG(x) )
2567     if (CLHS->isExactlyValue(-1.0)) {
2568       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2569       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2570         .addUse(FNeg.getReg(0))
2571         .setMIFlags(Flags);
2572 
2573       MI.eraseFromParent();
2574       return true;
2575     }
2576   }
2577 
2578   // x / y -> x * (1.0 / y)
2579   if (Unsafe) {
2580     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2581       .addUse(RHS)
2582       .setMIFlags(Flags);
2583     B.buildFMul(Res, LHS, RCP, Flags);
2584 
2585     MI.eraseFromParent();
2586     return true;
2587   }
2588 
2589   return false;
2590 }
2591 
2592 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2593                                          MachineRegisterInfo &MRI,
2594                                          MachineIRBuilder &B) const {
2595   B.setInstr(MI);
2596   Register Res = MI.getOperand(0).getReg();
2597   Register LHS = MI.getOperand(1).getReg();
2598   Register RHS = MI.getOperand(2).getReg();
2599 
2600   uint16_t Flags = MI.getFlags();
2601 
2602   LLT S16 = LLT::scalar(16);
2603   LLT S32 = LLT::scalar(32);
2604 
2605   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2606   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2607 
2608   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2609     .addUse(RHSExt.getReg(0))
2610     .setMIFlags(Flags);
2611 
2612   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2613   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2614 
2615   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2616     .addUse(RDst.getReg(0))
2617     .addUse(RHS)
2618     .addUse(LHS)
2619     .setMIFlags(Flags);
2620 
2621   MI.eraseFromParent();
2622   return true;
2623 }
2624 
2625 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2626 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2627 static void toggleSPDenormMode(bool Enable,
2628                                MachineIRBuilder &B,
2629                                const GCNSubtarget &ST,
2630                                AMDGPU::SIModeRegisterDefaults Mode) {
2631   // Set SP denorm mode to this value.
2632   unsigned SPDenormMode =
2633     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2634 
2635   if (ST.hasDenormModeInst()) {
2636     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2637     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2638 
2639     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2640     B.buildInstr(AMDGPU::S_DENORM_MODE)
2641       .addImm(NewDenormModeValue);
2642 
2643   } else {
2644     // Select FP32 bit field in mode register.
2645     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2646                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2647                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2648 
2649     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2650       .addImm(SPDenormMode)
2651       .addImm(SPDenormModeBitField);
2652   }
2653 }
2654 
2655 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2656                                          MachineRegisterInfo &MRI,
2657                                          MachineIRBuilder &B) const {
2658   B.setInstr(MI);
2659   Register Res = MI.getOperand(0).getReg();
2660   Register LHS = MI.getOperand(1).getReg();
2661   Register RHS = MI.getOperand(2).getReg();
2662   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2663   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2664 
2665   uint16_t Flags = MI.getFlags();
2666 
2667   LLT S32 = LLT::scalar(32);
2668   LLT S1 = LLT::scalar(1);
2669 
2670   auto One = B.buildFConstant(S32, 1.0f);
2671 
2672   auto DenominatorScaled =
2673     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2674       .addUse(RHS)
2675       .addUse(LHS)
2676       .addImm(1)
2677       .setMIFlags(Flags);
2678   auto NumeratorScaled =
2679     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2680       .addUse(LHS)
2681       .addUse(RHS)
2682       .addImm(0)
2683       .setMIFlags(Flags);
2684 
2685   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2686     .addUse(DenominatorScaled.getReg(0))
2687     .setMIFlags(Flags);
2688   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2689 
2690   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2691   // aren't modeled as reading it.
2692   if (!Mode.allFP32Denormals())
2693     toggleSPDenormMode(true, B, ST, Mode);
2694 
2695   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2696   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2697   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2698   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2699   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2700   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2701 
2702   if (!Mode.allFP32Denormals())
2703     toggleSPDenormMode(false, B, ST, Mode);
2704 
2705   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2706     .addUse(Fma4.getReg(0))
2707     .addUse(Fma1.getReg(0))
2708     .addUse(Fma3.getReg(0))
2709     .addUse(NumeratorScaled.getReg(1))
2710     .setMIFlags(Flags);
2711 
2712   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2713     .addUse(Fmas.getReg(0))
2714     .addUse(RHS)
2715     .addUse(LHS)
2716     .setMIFlags(Flags);
2717 
2718   MI.eraseFromParent();
2719   return true;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2723                                          MachineRegisterInfo &MRI,
2724                                          MachineIRBuilder &B) const {
2725   B.setInstr(MI);
2726   Register Res = MI.getOperand(0).getReg();
2727   Register LHS = MI.getOperand(1).getReg();
2728   Register RHS = MI.getOperand(2).getReg();
2729 
2730   uint16_t Flags = MI.getFlags();
2731 
2732   LLT S64 = LLT::scalar(64);
2733   LLT S1 = LLT::scalar(1);
2734 
2735   auto One = B.buildFConstant(S64, 1.0);
2736 
2737   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2738     .addUse(LHS)
2739     .addUse(RHS)
2740     .addImm(1)
2741     .setMIFlags(Flags);
2742 
2743   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2744 
2745   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2746     .addUse(DivScale0.getReg(0))
2747     .setMIFlags(Flags);
2748 
2749   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2750   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2751   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2752 
2753   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2754     .addUse(LHS)
2755     .addUse(RHS)
2756     .addImm(0)
2757     .setMIFlags(Flags);
2758 
2759   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2760   auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2761   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2762 
2763   Register Scale;
2764   if (!ST.hasUsableDivScaleConditionOutput()) {
2765     // Workaround a hardware bug on SI where the condition output from div_scale
2766     // is not usable.
2767 
2768     LLT S32 = LLT::scalar(32);
2769 
2770     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2771     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2772     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2773     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2774 
2775     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2776                               Scale1Unmerge.getReg(1));
2777     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2778                               Scale0Unmerge.getReg(1));
2779     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2780   } else {
2781     Scale = DivScale1.getReg(1);
2782   }
2783 
2784   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2785     .addUse(Fma4.getReg(0))
2786     .addUse(Fma3.getReg(0))
2787     .addUse(Mul.getReg(0))
2788     .addUse(Scale)
2789     .setMIFlags(Flags);
2790 
2791   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2792     .addUse(Fmas.getReg(0))
2793     .addUse(RHS)
2794     .addUse(LHS)
2795     .setMIFlags(Flags);
2796 
2797   MI.eraseFromParent();
2798   return true;
2799 }
2800 
2801 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2802                                                  MachineRegisterInfo &MRI,
2803                                                  MachineIRBuilder &B) const {
2804   B.setInstr(MI);
2805   Register Res = MI.getOperand(0).getReg();
2806   Register LHS = MI.getOperand(2).getReg();
2807   Register RHS = MI.getOperand(3).getReg();
2808   uint16_t Flags = MI.getFlags();
2809 
2810   LLT S32 = LLT::scalar(32);
2811   LLT S1 = LLT::scalar(1);
2812 
2813   auto Abs = B.buildFAbs(S32, RHS, Flags);
2814   const APFloat C0Val(1.0f);
2815 
2816   auto C0 = B.buildConstant(S32, 0x6f800000);
2817   auto C1 = B.buildConstant(S32, 0x2f800000);
2818   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2819 
2820   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2821   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2822 
2823   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2824 
2825   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2826     .addUse(Mul0.getReg(0))
2827     .setMIFlags(Flags);
2828 
2829   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2830 
2831   B.buildFMul(Res, Sel, Mul1, Flags);
2832 
2833   MI.eraseFromParent();
2834   return true;
2835 }
2836 
2837 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2838                                                  MachineRegisterInfo &MRI,
2839                                                  MachineIRBuilder &B) const {
2840   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2841   if (!MFI->isEntryFunction()) {
2842     return legalizePreloadedArgIntrin(MI, MRI, B,
2843                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2844   }
2845 
2846   B.setInstr(MI);
2847 
2848   uint64_t Offset =
2849     ST.getTargetLowering()->getImplicitParameterOffset(
2850       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2851   Register DstReg = MI.getOperand(0).getReg();
2852   LLT DstTy = MRI.getType(DstReg);
2853   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2854 
2855   const ArgDescriptor *Arg;
2856   const TargetRegisterClass *RC;
2857   std::tie(Arg, RC)
2858     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2859   if (!Arg)
2860     return false;
2861 
2862   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2863   if (!loadInputValue(KernargPtrReg, B, Arg))
2864     return false;
2865 
2866   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2867   MI.eraseFromParent();
2868   return true;
2869 }
2870 
2871 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2872                                               MachineRegisterInfo &MRI,
2873                                               MachineIRBuilder &B,
2874                                               unsigned AddrSpace) const {
2875   B.setInstr(MI);
2876   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2877   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2878   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2879   MI.eraseFromParent();
2880   return true;
2881 }
2882 
2883 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2884 // offset (the offset that is included in bounds checking and swizzling, to be
2885 // split between the instruction's voffset and immoffset fields) and soffset
2886 // (the offset that is excluded from bounds checking and swizzling, to go in
2887 // the instruction's soffset field).  This function takes the first kind of
2888 // offset and figures out how to split it between voffset and immoffset.
2889 std::tuple<Register, unsigned, unsigned>
2890 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2891                                         Register OrigOffset) const {
2892   const unsigned MaxImm = 4095;
2893   Register BaseReg;
2894   unsigned TotalConstOffset;
2895   MachineInstr *OffsetDef;
2896   const LLT S32 = LLT::scalar(32);
2897 
2898   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2899     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2900 
2901   unsigned ImmOffset = TotalConstOffset;
2902 
2903   // If the immediate value is too big for the immoffset field, put the value
2904   // and -4096 into the immoffset field so that the value that is copied/added
2905   // for the voffset field is a multiple of 4096, and it stands more chance
2906   // of being CSEd with the copy/add for another similar load/store.
2907   // However, do not do that rounding down to a multiple of 4096 if that is a
2908   // negative number, as it appears to be illegal to have a negative offset
2909   // in the vgpr, even if adding the immediate offset makes it positive.
2910   unsigned Overflow = ImmOffset & ~MaxImm;
2911   ImmOffset -= Overflow;
2912   if ((int32_t)Overflow < 0) {
2913     Overflow += ImmOffset;
2914     ImmOffset = 0;
2915   }
2916 
2917   if (Overflow != 0) {
2918     if (!BaseReg) {
2919       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2920     } else {
2921       auto OverflowVal = B.buildConstant(S32, Overflow);
2922       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2923     }
2924   }
2925 
2926   if (!BaseReg)
2927     BaseReg = B.buildConstant(S32, 0).getReg(0);
2928 
2929   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2930 }
2931 
2932 /// Handle register layout difference for f16 images for some subtargets.
2933 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2934                                              MachineRegisterInfo &MRI,
2935                                              Register Reg) const {
2936   if (!ST.hasUnpackedD16VMem())
2937     return Reg;
2938 
2939   const LLT S16 = LLT::scalar(16);
2940   const LLT S32 = LLT::scalar(32);
2941   LLT StoreVT = MRI.getType(Reg);
2942   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2943 
2944   auto Unmerge = B.buildUnmerge(S16, Reg);
2945 
2946   SmallVector<Register, 4> WideRegs;
2947   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2948     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2949 
2950   int NumElts = StoreVT.getNumElements();
2951 
2952   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2953 }
2954 
2955 Register AMDGPULegalizerInfo::fixStoreSourceType(
2956   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2957   MachineRegisterInfo *MRI = B.getMRI();
2958   LLT Ty = MRI->getType(VData);
2959 
2960   const LLT S16 = LLT::scalar(16);
2961 
2962   // Fixup illegal register types for i8 stores.
2963   if (Ty == LLT::scalar(8) || Ty == S16) {
2964     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2965     return AnyExt;
2966   }
2967 
2968   if (Ty.isVector()) {
2969     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2970       if (IsFormat)
2971         return handleD16VData(B, *MRI, VData);
2972     }
2973   }
2974 
2975   return VData;
2976 }
2977 
2978 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2979                                               MachineRegisterInfo &MRI,
2980                                               MachineIRBuilder &B,
2981                                               bool IsTyped,
2982                                               bool IsFormat) const {
2983   B.setInstr(MI);
2984 
2985   Register VData = MI.getOperand(1).getReg();
2986   LLT Ty = MRI.getType(VData);
2987   LLT EltTy = Ty.getScalarType();
2988   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2989   const LLT S32 = LLT::scalar(32);
2990 
2991   VData = fixStoreSourceType(B, VData, IsFormat);
2992   Register RSrc = MI.getOperand(2).getReg();
2993 
2994   MachineMemOperand *MMO = *MI.memoperands_begin();
2995   const int MemSize = MMO->getSize();
2996 
2997   unsigned ImmOffset;
2998   unsigned TotalOffset;
2999 
3000   // The typed intrinsics add an immediate after the registers.
3001   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3002 
3003   // The struct intrinsic variants add one additional operand over raw.
3004   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3005   Register VIndex;
3006   int OpOffset = 0;
3007   if (HasVIndex) {
3008     VIndex = MI.getOperand(3).getReg();
3009     OpOffset = 1;
3010   }
3011 
3012   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3013   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3014 
3015   unsigned Format = 0;
3016   if (IsTyped) {
3017     Format = MI.getOperand(5 + OpOffset).getImm();
3018     ++OpOffset;
3019   }
3020 
3021   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3022 
3023   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3024   if (TotalOffset != 0)
3025     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3026 
3027   unsigned Opc;
3028   if (IsTyped) {
3029     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3030                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3031   } else if (IsFormat) {
3032     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3033                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3034   } else {
3035     switch (MemSize) {
3036     case 1:
3037       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3038       break;
3039     case 2:
3040       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3041       break;
3042     default:
3043       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3044       break;
3045     }
3046   }
3047 
3048   if (!VIndex)
3049     VIndex = B.buildConstant(S32, 0).getReg(0);
3050 
3051   auto MIB = B.buildInstr(Opc)
3052     .addUse(VData)              // vdata
3053     .addUse(RSrc)               // rsrc
3054     .addUse(VIndex)             // vindex
3055     .addUse(VOffset)            // voffset
3056     .addUse(SOffset)            // soffset
3057     .addImm(ImmOffset);         // offset(imm)
3058 
3059   if (IsTyped)
3060     MIB.addImm(Format);
3061 
3062   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3063      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3064      .addMemOperand(MMO);
3065 
3066   MI.eraseFromParent();
3067   return true;
3068 }
3069 
3070 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3071                                              MachineRegisterInfo &MRI,
3072                                              MachineIRBuilder &B,
3073                                              bool IsFormat,
3074                                              bool IsTyped) const {
3075   B.setInstr(MI);
3076 
3077   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3078   MachineMemOperand *MMO = *MI.memoperands_begin();
3079   const int MemSize = MMO->getSize();
3080   const LLT S32 = LLT::scalar(32);
3081 
3082   Register Dst = MI.getOperand(0).getReg();
3083   Register RSrc = MI.getOperand(2).getReg();
3084 
3085   // The typed intrinsics add an immediate after the registers.
3086   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3087 
3088   // The struct intrinsic variants add one additional operand over raw.
3089   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3090   Register VIndex;
3091   int OpOffset = 0;
3092   if (HasVIndex) {
3093     VIndex = MI.getOperand(3).getReg();
3094     OpOffset = 1;
3095   }
3096 
3097   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3098   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3099 
3100   unsigned Format = 0;
3101   if (IsTyped) {
3102     Format = MI.getOperand(5 + OpOffset).getImm();
3103     ++OpOffset;
3104   }
3105 
3106   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3107   unsigned ImmOffset;
3108   unsigned TotalOffset;
3109 
3110   LLT Ty = MRI.getType(Dst);
3111   LLT EltTy = Ty.getScalarType();
3112   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3113   const bool Unpacked = ST.hasUnpackedD16VMem();
3114 
3115   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3116   if (TotalOffset != 0)
3117     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3118 
3119   unsigned Opc;
3120 
3121   if (IsTyped) {
3122     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3123                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3124   } else if (IsFormat) {
3125     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3126                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3127   } else {
3128     switch (MemSize) {
3129     case 1:
3130       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3131       break;
3132     case 2:
3133       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3134       break;
3135     default:
3136       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3137       break;
3138     }
3139   }
3140 
3141   Register LoadDstReg;
3142 
3143   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3144   LLT UnpackedTy = Ty.changeElementSize(32);
3145 
3146   if (IsExtLoad)
3147     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3148   else if (Unpacked && IsD16 && Ty.isVector())
3149     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3150   else
3151     LoadDstReg = Dst;
3152 
3153   if (!VIndex)
3154     VIndex = B.buildConstant(S32, 0).getReg(0);
3155 
3156   auto MIB = B.buildInstr(Opc)
3157     .addDef(LoadDstReg)         // vdata
3158     .addUse(RSrc)               // rsrc
3159     .addUse(VIndex)             // vindex
3160     .addUse(VOffset)            // voffset
3161     .addUse(SOffset)            // soffset
3162     .addImm(ImmOffset);         // offset(imm)
3163 
3164   if (IsTyped)
3165     MIB.addImm(Format);
3166 
3167   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3168      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3169      .addMemOperand(MMO);
3170 
3171   if (LoadDstReg != Dst) {
3172     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3173 
3174     // Widen result for extending loads was widened.
3175     if (IsExtLoad)
3176       B.buildTrunc(Dst, LoadDstReg);
3177     else {
3178       // Repack to original 16-bit vector result
3179       // FIXME: G_TRUNC should work, but legalization currently fails
3180       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3181       SmallVector<Register, 4> Repack;
3182       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3183         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3184       B.buildMerge(Dst, Repack);
3185     }
3186   }
3187 
3188   MI.eraseFromParent();
3189   return true;
3190 }
3191 
3192 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3193                                                MachineIRBuilder &B,
3194                                                bool IsInc) const {
3195   B.setInstr(MI);
3196   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3197                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3198   B.buildInstr(Opc)
3199     .addDef(MI.getOperand(0).getReg())
3200     .addUse(MI.getOperand(2).getReg())
3201     .addUse(MI.getOperand(3).getReg())
3202     .cloneMemRefs(MI);
3203   MI.eraseFromParent();
3204   return true;
3205 }
3206 
3207 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3208   switch (IntrID) {
3209   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3210   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3211     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3212   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3213   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3214     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3215   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3216   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3217     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3218   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3219   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3220     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3221   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3222   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3223     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3224   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3225   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3226     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3227   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3228   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3229     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3230   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3231   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3232     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3233   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3234   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3235     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3236   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3237   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3238     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3239   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3240   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3241     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3242   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3243   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3244     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3245   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3246   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3247     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3248   default:
3249     llvm_unreachable("unhandled atomic opcode");
3250   }
3251 }
3252 
3253 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3254                                                MachineIRBuilder &B,
3255                                                Intrinsic::ID IID) const {
3256   B.setInstr(MI);
3257 
3258   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3259                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3260 
3261   Register Dst = MI.getOperand(0).getReg();
3262   Register VData = MI.getOperand(2).getReg();
3263 
3264   Register CmpVal;
3265   int OpOffset = 0;
3266 
3267   if (IsCmpSwap) {
3268     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3269     ++OpOffset;
3270   }
3271 
3272   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3273   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3274 
3275   // The struct intrinsic variants add one additional operand over raw.
3276   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3277   Register VIndex;
3278   if (HasVIndex) {
3279     VIndex = MI.getOperand(4 + OpOffset).getReg();
3280     ++OpOffset;
3281   }
3282 
3283   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3284   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3285   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3286 
3287   MachineMemOperand *MMO = *MI.memoperands_begin();
3288 
3289   unsigned ImmOffset;
3290   unsigned TotalOffset;
3291   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3292   if (TotalOffset != 0)
3293     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3294 
3295   if (!VIndex)
3296     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3297 
3298   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3299     .addDef(Dst)
3300     .addUse(VData); // vdata
3301 
3302   if (IsCmpSwap)
3303     MIB.addReg(CmpVal);
3304 
3305   MIB.addUse(RSrc)               // rsrc
3306      .addUse(VIndex)             // vindex
3307      .addUse(VOffset)            // voffset
3308      .addUse(SOffset)            // soffset
3309      .addImm(ImmOffset)          // offset(imm)
3310      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3311      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3312      .addMemOperand(MMO);
3313 
3314   MI.eraseFromParent();
3315   return true;
3316 }
3317 
3318 // Produce a vector of s16 elements from s32 pieces.
3319 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3320                              ArrayRef<Register> UnmergeParts) {
3321   const LLT S16 = LLT::scalar(16);
3322 
3323   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3324   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3325     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3326 
3327   B.buildBuildVector(DstReg, RemergeParts);
3328 }
3329 
3330 /// Convert a set of s32 registers to a result vector with s16 elements.
3331 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3332                                ArrayRef<Register> UnmergeParts) {
3333   MachineRegisterInfo &MRI = *B.getMRI();
3334   const LLT V2S16 = LLT::vector(2, 16);
3335   LLT TargetTy = MRI.getType(DstReg);
3336   int NumElts = UnmergeParts.size();
3337 
3338   if (NumElts == 1) {
3339     assert(TargetTy == V2S16);
3340     B.buildBitcast(DstReg, UnmergeParts[0]);
3341     return;
3342   }
3343 
3344   SmallVector<Register, 4> RemergeParts(NumElts);
3345   for (int I = 0; I != NumElts; ++I)
3346     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3347 
3348   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3349     B.buildConcatVectors(DstReg, RemergeParts);
3350     return;
3351   }
3352 
3353   const LLT V3S16 = LLT::vector(3, 16);
3354   const LLT V6S16 = LLT::vector(6, 16);
3355 
3356   // Widen to v6s16 and unpack v3 parts.
3357   assert(TargetTy == V3S16);
3358 
3359   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3360   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3361   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3362 }
3363 
3364 // FIXME: Just vector trunc should be sufficent, but legalization currently
3365 // broken.
3366 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3367                                   Register WideDstReg) {
3368   const LLT S32 = LLT::scalar(32);
3369   const LLT S16 = LLT::scalar(16);
3370 
3371   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3372 
3373   int NumOps = Unmerge->getNumOperands() - 1;
3374   SmallVector<Register, 4> RemergeParts(NumOps);
3375   for (int I = 0; I != NumOps; ++I)
3376     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3377 
3378   B.buildBuildVector(DstReg, RemergeParts);
3379 }
3380 
3381 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3382     MachineInstr &MI, MachineIRBuilder &B,
3383     GISelChangeObserver &Observer,
3384     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3385   bool IsTFE = MI.getNumExplicitDefs() == 2;
3386 
3387   // We are only processing the operands of d16 image operations on subtargets
3388   // that use the unpacked register layout, or need to repack the TFE result.
3389 
3390   // TODO: Need to handle a16 images too
3391   // TODO: Do we need to guard against already legalized intrinsics?
3392   if (!IsTFE && !ST.hasUnpackedD16VMem())
3393     return true;
3394 
3395   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3396     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3397 
3398   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3399     return true;
3400 
3401   B.setInstr(MI);
3402 
3403   MachineRegisterInfo *MRI = B.getMRI();
3404   const LLT S32 = LLT::scalar(32);
3405   const LLT S16 = LLT::scalar(16);
3406 
3407   if (BaseOpcode->Store) { // No TFE for stores?
3408     Register VData = MI.getOperand(1).getReg();
3409     LLT Ty = MRI->getType(VData);
3410     if (!Ty.isVector() || Ty.getElementType() != S16)
3411       return true;
3412 
3413     B.setInstr(MI);
3414 
3415     Observer.changingInstr(MI);
3416     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3417     Observer.changedInstr(MI);
3418     return true;
3419   }
3420 
3421   Register DstReg = MI.getOperand(0).getReg();
3422   LLT Ty = MRI->getType(DstReg);
3423   const LLT EltTy = Ty.getScalarType();
3424   const bool IsD16 = Ty.getScalarType() == S16;
3425   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3426 
3427   if (IsTFE) {
3428     // In the IR, TFE is supposed to be used with a 2 element struct return
3429     // type. The intruction really returns these two values in one contiguous
3430     // register, with one additional dword beyond the loaded data. Rewrite the
3431     // return type to use a single register result.
3432     Register Dst1Reg = MI.getOperand(1).getReg();
3433     if (MRI->getType(Dst1Reg) != S32)
3434       return false;
3435 
3436     // TODO: Make sure the TFE operand bit is set.
3437 
3438     // The raw dword aligned data component of the load. The only legal cases
3439     // where this matters should be when using the packed D16 format, for
3440     // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3441     LLT RoundedTy;
3442     LLT TFETy;
3443 
3444     if (IsD16 && ST.hasUnpackedD16VMem()) {
3445       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3446       TFETy = LLT::vector(NumElts + 1, 32);
3447     } else {
3448       unsigned EltSize = Ty.getScalarSizeInBits();
3449       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3450       unsigned RoundedSize = 32 * RoundedElts;
3451       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3452       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3453     }
3454 
3455     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3456     Observer.changingInstr(MI);
3457 
3458     MI.getOperand(0).setReg(TFEReg);
3459     MI.RemoveOperand(1);
3460 
3461     Observer.changedInstr(MI);
3462 
3463     // Insert after the instruction.
3464     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3465 
3466     // Now figure out how to copy the new result register back into the old
3467     // result.
3468 
3469     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3470     int NumDataElts = TFETy.getNumElements() - 1;
3471 
3472     if (!Ty.isVector()) {
3473       // Simplest case is a trivial unmerge (plus a truncate for d16).
3474       UnmergeResults[0] = Ty == S32 ?
3475         DstReg : MRI->createGenericVirtualRegister(S32);
3476 
3477       B.buildUnmerge(UnmergeResults, TFEReg);
3478       if (Ty != S32)
3479         B.buildTrunc(DstReg, UnmergeResults[0]);
3480       return true;
3481     }
3482 
3483     // We have to repack into a new vector of some kind.
3484     for (int I = 0; I != NumDataElts; ++I)
3485       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3486     B.buildUnmerge(UnmergeResults, TFEReg);
3487 
3488     // Drop the final TFE element.
3489     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3490 
3491     if (EltTy == S32)
3492       B.buildBuildVector(DstReg, DataPart);
3493     else if (ST.hasUnpackedD16VMem())
3494       truncToS16Vector(B, DstReg, DataPart);
3495     else
3496       bitcastToS16Vector(B, DstReg, DataPart);
3497 
3498     return true;
3499   }
3500 
3501   // Must be an image load.
3502   if (!Ty.isVector() || Ty.getElementType() != S16)
3503     return true;
3504 
3505   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3506 
3507   LLT WidenedTy = Ty.changeElementType(S32);
3508   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3509 
3510   Observer.changingInstr(MI);
3511   MI.getOperand(0).setReg(WideDstReg);
3512   Observer.changedInstr(MI);
3513 
3514   repackUnpackedD16Load(B, DstReg, WideDstReg);
3515   return true;
3516 }
3517 
3518 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3519   MachineInstr &MI, MachineIRBuilder &B,
3520   GISelChangeObserver &Observer) const {
3521   Register Dst = MI.getOperand(0).getReg();
3522   LLT Ty = B.getMRI()->getType(Dst);
3523   unsigned Size = Ty.getSizeInBits();
3524   MachineFunction &MF = B.getMF();
3525 
3526   Observer.changingInstr(MI);
3527 
3528   // FIXME: We don't really need this intermediate instruction. The intrinsic
3529   // should be fixed to have a memory operand. Since it's readnone, we're not
3530   // allowed to add one.
3531   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3532   MI.RemoveOperand(1); // Remove intrinsic ID
3533 
3534   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3535   // TODO: Should this use datalayout alignment?
3536   const unsigned MemSize = (Size + 7) / 8;
3537   const unsigned MemAlign = 4;
3538   MachineMemOperand *MMO = MF.getMachineMemOperand(
3539     MachinePointerInfo(),
3540     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3541     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3542   MI.addMemOperand(MF, MMO);
3543 
3544   // There are no 96-bit result scalar loads, but widening to 128-bit should
3545   // always be legal. We may need to restore this to a 96-bit result if it turns
3546   // out this needs to be converted to a vector load during RegBankSelect.
3547   if (!isPowerOf2_32(Size)) {
3548     LegalizerHelper Helper(MF, *this, Observer, B);
3549     B.setInstr(MI);
3550 
3551     if (Ty.isVector())
3552       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3553     else
3554       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3555   }
3556 
3557   Observer.changedInstr(MI);
3558   return true;
3559 }
3560 
3561 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3562                                             MachineIRBuilder &B,
3563                                             GISelChangeObserver &Observer) const {
3564   MachineRegisterInfo &MRI = *B.getMRI();
3565 
3566   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
3567   auto IntrID = MI.getIntrinsicID();
3568   switch (IntrID) {
3569   case Intrinsic::amdgcn_if:
3570   case Intrinsic::amdgcn_else: {
3571     MachineInstr *Br = nullptr;
3572     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3573       const SIRegisterInfo *TRI
3574         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3575 
3576       B.setInstr(*BrCond);
3577       Register Def = MI.getOperand(1).getReg();
3578       Register Use = MI.getOperand(3).getReg();
3579 
3580       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3581       if (Br)
3582         BrTarget = Br->getOperand(0).getMBB();
3583 
3584       if (IntrID == Intrinsic::amdgcn_if) {
3585         B.buildInstr(AMDGPU::SI_IF)
3586           .addDef(Def)
3587           .addUse(Use)
3588           .addMBB(BrTarget);
3589       } else {
3590         B.buildInstr(AMDGPU::SI_ELSE)
3591           .addDef(Def)
3592           .addUse(Use)
3593           .addMBB(BrTarget)
3594           .addImm(0);
3595       }
3596 
3597       if (Br)
3598         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3599 
3600       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3601       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3602       MI.eraseFromParent();
3603       BrCond->eraseFromParent();
3604       return true;
3605     }
3606 
3607     return false;
3608   }
3609   case Intrinsic::amdgcn_loop: {
3610     MachineInstr *Br = nullptr;
3611     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3612       const SIRegisterInfo *TRI
3613         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3614 
3615       B.setInstr(*BrCond);
3616 
3617       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3618       if (Br)
3619         BrTarget = Br->getOperand(0).getMBB();
3620 
3621       Register Reg = MI.getOperand(2).getReg();
3622       B.buildInstr(AMDGPU::SI_LOOP)
3623         .addUse(Reg)
3624         .addMBB(BrTarget);
3625 
3626       if (Br)
3627         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3628 
3629       MI.eraseFromParent();
3630       BrCond->eraseFromParent();
3631       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3632       return true;
3633     }
3634 
3635     return false;
3636   }
3637   case Intrinsic::amdgcn_kernarg_segment_ptr:
3638     return legalizePreloadedArgIntrin(
3639       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3640   case Intrinsic::amdgcn_implicitarg_ptr:
3641     return legalizeImplicitArgPtr(MI, MRI, B);
3642   case Intrinsic::amdgcn_workitem_id_x:
3643     return legalizePreloadedArgIntrin(MI, MRI, B,
3644                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3645   case Intrinsic::amdgcn_workitem_id_y:
3646     return legalizePreloadedArgIntrin(MI, MRI, B,
3647                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3648   case Intrinsic::amdgcn_workitem_id_z:
3649     return legalizePreloadedArgIntrin(MI, MRI, B,
3650                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3651   case Intrinsic::amdgcn_workgroup_id_x:
3652     return legalizePreloadedArgIntrin(MI, MRI, B,
3653                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3654   case Intrinsic::amdgcn_workgroup_id_y:
3655     return legalizePreloadedArgIntrin(MI, MRI, B,
3656                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3657   case Intrinsic::amdgcn_workgroup_id_z:
3658     return legalizePreloadedArgIntrin(MI, MRI, B,
3659                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3660   case Intrinsic::amdgcn_dispatch_ptr:
3661     return legalizePreloadedArgIntrin(MI, MRI, B,
3662                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3663   case Intrinsic::amdgcn_queue_ptr:
3664     return legalizePreloadedArgIntrin(MI, MRI, B,
3665                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3666   case Intrinsic::amdgcn_implicit_buffer_ptr:
3667     return legalizePreloadedArgIntrin(
3668       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3669   case Intrinsic::amdgcn_dispatch_id:
3670     return legalizePreloadedArgIntrin(MI, MRI, B,
3671                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3672   case Intrinsic::amdgcn_fdiv_fast:
3673     return legalizeFDIVFastIntrin(MI, MRI, B);
3674   case Intrinsic::amdgcn_is_shared:
3675     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3676   case Intrinsic::amdgcn_is_private:
3677     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3678   case Intrinsic::amdgcn_wavefrontsize: {
3679     B.setInstr(MI);
3680     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3681     MI.eraseFromParent();
3682     return true;
3683   }
3684   case Intrinsic::amdgcn_s_buffer_load:
3685     return legalizeSBufferLoad(MI, B, Observer);
3686   case Intrinsic::amdgcn_raw_buffer_store:
3687   case Intrinsic::amdgcn_struct_buffer_store:
3688     return legalizeBufferStore(MI, MRI, B, false, false);
3689   case Intrinsic::amdgcn_raw_buffer_store_format:
3690   case Intrinsic::amdgcn_struct_buffer_store_format:
3691     return legalizeBufferStore(MI, MRI, B, false, true);
3692   case Intrinsic::amdgcn_raw_tbuffer_store:
3693   case Intrinsic::amdgcn_struct_tbuffer_store:
3694     return legalizeBufferStore(MI, MRI, B, true, true);
3695   case Intrinsic::amdgcn_raw_buffer_load:
3696   case Intrinsic::amdgcn_struct_buffer_load:
3697     return legalizeBufferLoad(MI, MRI, B, false, false);
3698   case Intrinsic::amdgcn_raw_buffer_load_format:
3699   case Intrinsic::amdgcn_struct_buffer_load_format:
3700     return legalizeBufferLoad(MI, MRI, B, true, false);
3701   case Intrinsic::amdgcn_raw_tbuffer_load:
3702   case Intrinsic::amdgcn_struct_tbuffer_load:
3703     return legalizeBufferLoad(MI, MRI, B, true, true);
3704   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3705   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3706   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3707   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3708   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3709   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3710   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3711   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3712   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3713   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3714   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3715   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3716   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3717   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3718   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3719   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3720   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3721   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3722   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3723   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3724   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3725   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3726   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3727   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3728   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3729   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3730     return legalizeBufferAtomic(MI, B, IntrID);
3731   case Intrinsic::amdgcn_atomic_inc:
3732     return legalizeAtomicIncDec(MI, B, true);
3733   case Intrinsic::amdgcn_atomic_dec:
3734     return legalizeAtomicIncDec(MI, B, false);
3735   default: {
3736     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3737             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3738       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3739     return true;
3740   }
3741   }
3742 
3743   return true;
3744 }
3745