1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Round the number of elements to the next power of two elements
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
52 // Round the number of bits to the next power of two bits
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
182   return [=](const LegalityQuery &Query) {
183     return Query.Types[TypeIdx0].getSizeInBits() <
184            Query.Types[TypeIdx1].getSizeInBits();
185   };
186 }
187 
188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
189   return [=](const LegalityQuery &Query) {
190     return Query.Types[TypeIdx0].getSizeInBits() >
191            Query.Types[TypeIdx1].getSizeInBits();
192   };
193 }
194 
195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
196                                          const GCNTargetMachine &TM)
197   :  ST(ST_) {
198   using namespace TargetOpcode;
199 
200   auto GetAddrSpacePtr = [&TM](unsigned AS) {
201     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
202   };
203 
204   const LLT S1 = LLT::scalar(1);
205   const LLT S16 = LLT::scalar(16);
206   const LLT S32 = LLT::scalar(32);
207   const LLT S64 = LLT::scalar(64);
208   const LLT S128 = LLT::scalar(128);
209   const LLT S256 = LLT::scalar(256);
210   const LLT S1024 = LLT::scalar(1024);
211 
212   const LLT V2S16 = LLT::vector(2, 16);
213   const LLT V4S16 = LLT::vector(4, 16);
214 
215   const LLT V2S32 = LLT::vector(2, 32);
216   const LLT V3S32 = LLT::vector(3, 32);
217   const LLT V4S32 = LLT::vector(4, 32);
218   const LLT V5S32 = LLT::vector(5, 32);
219   const LLT V6S32 = LLT::vector(6, 32);
220   const LLT V7S32 = LLT::vector(7, 32);
221   const LLT V8S32 = LLT::vector(8, 32);
222   const LLT V9S32 = LLT::vector(9, 32);
223   const LLT V10S32 = LLT::vector(10, 32);
224   const LLT V11S32 = LLT::vector(11, 32);
225   const LLT V12S32 = LLT::vector(12, 32);
226   const LLT V13S32 = LLT::vector(13, 32);
227   const LLT V14S32 = LLT::vector(14, 32);
228   const LLT V15S32 = LLT::vector(15, 32);
229   const LLT V16S32 = LLT::vector(16, 32);
230   const LLT V32S32 = LLT::vector(32, 32);
231 
232   const LLT V2S64 = LLT::vector(2, 64);
233   const LLT V3S64 = LLT::vector(3, 64);
234   const LLT V4S64 = LLT::vector(4, 64);
235   const LLT V5S64 = LLT::vector(5, 64);
236   const LLT V6S64 = LLT::vector(6, 64);
237   const LLT V7S64 = LLT::vector(7, 64);
238   const LLT V8S64 = LLT::vector(8, 64);
239   const LLT V16S64 = LLT::vector(16, 64);
240 
241   std::initializer_list<LLT> AllS32Vectors =
242     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
243      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
244   std::initializer_list<LLT> AllS64Vectors =
245     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
246 
247   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
248   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
249   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
250   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
251   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
252   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
253   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
254 
255   const LLT CodePtr = FlatPtr;
256 
257   const std::initializer_list<LLT> AddrSpaces64 = {
258     GlobalPtr, ConstantPtr, FlatPtr
259   };
260 
261   const std::initializer_list<LLT> AddrSpaces32 = {
262     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
263   };
264 
265   const std::initializer_list<LLT> FPTypesBase = {
266     S32, S64
267   };
268 
269   const std::initializer_list<LLT> FPTypes16 = {
270     S32, S64, S16
271   };
272 
273   const std::initializer_list<LLT> FPTypesPK16 = {
274     S32, S64, S16, V2S16
275   };
276 
277   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
278 
279   setAction({G_BRCOND, S1}, Legal); // VCC branches
280   setAction({G_BRCOND, S32}, Legal); // SCC branches
281 
282   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
283   // elements for v3s16
284   getActionDefinitionsBuilder(G_PHI)
285     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
286     .legalFor(AllS32Vectors)
287     .legalFor(AllS64Vectors)
288     .legalFor(AddrSpaces64)
289     .legalFor(AddrSpaces32)
290     .clampScalar(0, S32, S256)
291     .widenScalarToNextPow2(0, 32)
292     .clampMaxNumElements(0, S32, 16)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .legalIf(isPointer(0));
295 
296   if (ST.has16BitInsts()) {
297     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
298       .legalFor({S32, S16})
299       .clampScalar(0, S16, S32)
300       .scalarize(0)
301       .widenScalarToNextPow2(0, 32);
302   } else {
303     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
304       .legalFor({S32})
305       .clampScalar(0, S32, S32)
306       .scalarize(0);
307   }
308 
309   // FIXME: Not really legal. Placeholder for custom lowering.
310   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
311     .customFor({S32, S64})
312     .clampScalar(0, S32, S64)
313     .widenScalarToNextPow2(0, 32)
314     .scalarize(0);
315 
316   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
317     .legalFor({S32})
318     .clampScalar(0, S32, S32)
319     .scalarize(0);
320 
321   // Report legal for any types we can handle anywhere. For the cases only legal
322   // on the SALU, RegBankSelect will be able to re-legalize.
323   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
324     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
325     .clampScalar(0, S32, S64)
326     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
327     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
328     .widenScalarToNextPow2(0)
329     .scalarize(0);
330 
331   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
332                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
333     .legalFor({{S32, S1}, {S32, S32}})
334     .clampScalar(0, S32, S32)
335     .scalarize(0); // TODO: Implement.
336 
337   getActionDefinitionsBuilder(G_BITCAST)
338     // Don't worry about the size constraint.
339     .legalIf(all(isRegisterType(0), isRegisterType(1)))
340     .lower();
341 
342 
343   getActionDefinitionsBuilder(G_CONSTANT)
344     .legalFor({S1, S32, S64, S16, GlobalPtr,
345                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
346     .clampScalar(0, S32, S64)
347     .widenScalarToNextPow2(0)
348     .legalIf(isPointer(0));
349 
350   getActionDefinitionsBuilder(G_FCONSTANT)
351     .legalFor({S32, S64, S16})
352     .clampScalar(0, S16, S64);
353 
354   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
355     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
356                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
357     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
358     .clampScalarOrElt(0, S32, S1024)
359     .legalIf(isMultiple32(0))
360     .widenScalarToNextPow2(0, 32)
361     .clampMaxNumElements(0, S32, 16);
362 
363   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
364   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
365     .unsupportedFor({PrivatePtr})
366     .custom();
367   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
368 
369   auto &FPOpActions = getActionDefinitionsBuilder(
370     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
371     .legalFor({S32, S64});
372   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
373     .customFor({S32, S64});
374   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
375     .customFor({S32, S64});
376 
377   if (ST.has16BitInsts()) {
378     if (ST.hasVOP3PInsts())
379       FPOpActions.legalFor({S16, V2S16});
380     else
381       FPOpActions.legalFor({S16});
382 
383     TrigActions.customFor({S16});
384     FDIVActions.customFor({S16});
385   }
386 
387   auto &MinNumMaxNum = getActionDefinitionsBuilder({
388       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
389 
390   if (ST.hasVOP3PInsts()) {
391     MinNumMaxNum.customFor(FPTypesPK16)
392       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
393       .clampMaxNumElements(0, S16, 2)
394       .clampScalar(0, S16, S64)
395       .scalarize(0);
396   } else if (ST.has16BitInsts()) {
397     MinNumMaxNum.customFor(FPTypes16)
398       .clampScalar(0, S16, S64)
399       .scalarize(0);
400   } else {
401     MinNumMaxNum.customFor(FPTypesBase)
402       .clampScalar(0, S32, S64)
403       .scalarize(0);
404   }
405 
406   if (ST.hasVOP3PInsts())
407     FPOpActions.clampMaxNumElements(0, S16, 2);
408 
409   FPOpActions
410     .scalarize(0)
411     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
412 
413   TrigActions
414     .scalarize(0)
415     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
416 
417   FDIVActions
418     .scalarize(0)
419     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
420 
421   getActionDefinitionsBuilder({G_FNEG, G_FABS})
422     .legalFor(FPTypesPK16)
423     .clampMaxNumElements(0, S16, 2)
424     .scalarize(0)
425     .clampScalar(0, S16, S64);
426 
427   if (ST.has16BitInsts()) {
428     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
429       .legalFor({S32, S64, S16})
430       .scalarize(0)
431       .clampScalar(0, S16, S64);
432   } else {
433     getActionDefinitionsBuilder(G_FSQRT)
434       .legalFor({S32, S64})
435       .scalarize(0)
436       .clampScalar(0, S32, S64);
437 
438     if (ST.hasFractBug()) {
439       getActionDefinitionsBuilder(G_FFLOOR)
440         .customFor({S64})
441         .legalFor({S32, S64})
442         .scalarize(0)
443         .clampScalar(0, S32, S64);
444     } else {
445       getActionDefinitionsBuilder(G_FFLOOR)
446         .legalFor({S32, S64})
447         .scalarize(0)
448         .clampScalar(0, S32, S64);
449     }
450   }
451 
452   getActionDefinitionsBuilder(G_FPTRUNC)
453     .legalFor({{S32, S64}, {S16, S32}})
454     .scalarize(0)
455     .lower();
456 
457   getActionDefinitionsBuilder(G_FPEXT)
458     .legalFor({{S64, S32}, {S32, S16}})
459     .lowerFor({{S64, S16}}) // FIXME: Implement
460     .scalarize(0);
461 
462   getActionDefinitionsBuilder(G_FSUB)
463       // Use actual fsub instruction
464       .legalFor({S32})
465       // Must use fadd + fneg
466       .lowerFor({S64, S16, V2S16})
467       .scalarize(0)
468       .clampScalar(0, S32, S64);
469 
470   // Whether this is legal depends on the floating point mode for the function.
471   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
472   if (ST.hasMadF16())
473     FMad.customFor({S32, S16});
474   else
475     FMad.customFor({S32});
476   FMad.scalarize(0)
477       .lower();
478 
479   getActionDefinitionsBuilder(G_TRUNC)
480     .alwaysLegal();
481 
482   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
483     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
484                {S32, S1}, {S64, S1}, {S16, S1}})
485     .scalarize(0)
486     .clampScalar(0, S32, S64)
487     .widenScalarToNextPow2(1, 32);
488 
489   // TODO: Split s1->s64 during regbankselect for VALU.
490   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
491     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
492     .lowerFor({{S32, S64}})
493     .lowerIf(typeIs(1, S1))
494     .customFor({{S64, S64}});
495   if (ST.has16BitInsts())
496     IToFP.legalFor({{S16, S16}});
497   IToFP.clampScalar(1, S32, S64)
498        .scalarize(0)
499        .widenScalarToNextPow2(1);
500 
501   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
502     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
503     .customFor({{S64, S64}});
504   if (ST.has16BitInsts())
505     FPToI.legalFor({{S16, S16}});
506   else
507     FPToI.minScalar(1, S32);
508 
509   FPToI.minScalar(0, S32)
510        .scalarize(0)
511        .lower();
512 
513   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
514     .scalarize(0)
515     .lower();
516 
517   if (ST.has16BitInsts()) {
518     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
519       .legalFor({S16, S32, S64})
520       .clampScalar(0, S16, S64)
521       .scalarize(0);
522   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
523     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
524       .legalFor({S32, S64})
525       .clampScalar(0, S32, S64)
526       .scalarize(0);
527   } else {
528     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
529       .legalFor({S32})
530       .customFor({S64})
531       .clampScalar(0, S32, S64)
532       .scalarize(0);
533   }
534 
535   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
536     .scalarize(0)
537     .alwaysLegal();
538 
539   auto &CmpBuilder =
540     getActionDefinitionsBuilder(G_ICMP)
541     // The compare output type differs based on the register bank of the output,
542     // so make both s1 and s32 legal.
543     //
544     // Scalar compares producing output in scc will be promoted to s32, as that
545     // is the allocatable register type that will be needed for the copy from
546     // scc. This will be promoted during RegBankSelect, and we assume something
547     // before that won't try to use s32 result types.
548     //
549     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
550     // bank.
551     .legalForCartesianProduct(
552       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
553     .legalForCartesianProduct(
554       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
555   if (ST.has16BitInsts()) {
556     CmpBuilder.legalFor({{S1, S16}});
557   }
558 
559   CmpBuilder
560     .widenScalarToNextPow2(1)
561     .clampScalar(1, S32, S64)
562     .scalarize(0)
563     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
564 
565   getActionDefinitionsBuilder(G_FCMP)
566     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
567     .widenScalarToNextPow2(1)
568     .clampScalar(1, S32, S64)
569     .scalarize(0);
570 
571   // FIXME: fpow has a selection pattern that should move to custom lowering.
572   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
573   if (ST.has16BitInsts())
574     Exp2Ops.legalFor({S32, S16});
575   else
576     Exp2Ops.legalFor({S32});
577   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
578   Exp2Ops.scalarize(0);
579 
580   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
581   if (ST.has16BitInsts())
582     ExpOps.customFor({{S32}, {S16}});
583   else
584     ExpOps.customFor({S32});
585   ExpOps.clampScalar(0, MinScalarFPTy, S32)
586         .scalarize(0);
587 
588   // The 64-bit versions produce 32-bit results, but only on the SALU.
589   getActionDefinitionsBuilder(G_CTPOP)
590     .legalFor({{S32, S32}, {S32, S64}})
591     .clampScalar(0, S32, S32)
592     .clampScalar(1, S32, S64)
593     .scalarize(0)
594     .widenScalarToNextPow2(0, 32)
595     .widenScalarToNextPow2(1, 32);
596 
597   // The hardware instructions return a different result on 0 than the generic
598   // instructions expect. The hardware produces -1, but these produce the
599   // bitwidth.
600   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
601     .scalarize(0)
602     .clampScalar(0, S32, S32)
603     .clampScalar(1, S32, S64)
604     .widenScalarToNextPow2(0, 32)
605     .widenScalarToNextPow2(1, 32)
606     .lower();
607 
608   // The 64-bit versions produce 32-bit results, but only on the SALU.
609   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
610     .legalFor({{S32, S32}, {S32, S64}})
611     .clampScalar(0, S32, S32)
612     .clampScalar(1, S32, S64)
613     .scalarize(0)
614     .widenScalarToNextPow2(0, 32)
615     .widenScalarToNextPow2(1, 32);
616 
617   getActionDefinitionsBuilder(G_BITREVERSE)
618     .legalFor({S32})
619     .clampScalar(0, S32, S32)
620     .scalarize(0);
621 
622   if (ST.has16BitInsts()) {
623     getActionDefinitionsBuilder(G_BSWAP)
624       .legalFor({S16, S32, V2S16})
625       .clampMaxNumElements(0, S16, 2)
626       // FIXME: Fixing non-power-of-2 before clamp is workaround for
627       // narrowScalar limitation.
628       .widenScalarToNextPow2(0)
629       .clampScalar(0, S16, S32)
630       .scalarize(0);
631 
632     if (ST.hasVOP3PInsts()) {
633       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
634         .legalFor({S32, S16, V2S16})
635         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
636         .clampMaxNumElements(0, S16, 2)
637         .clampScalar(0, S16, S32)
638         .widenScalarToNextPow2(0)
639         .scalarize(0);
640     } else {
641       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
642         .legalFor({S32, S16})
643         .widenScalarToNextPow2(0)
644         .clampScalar(0, S16, S32)
645         .scalarize(0);
646     }
647   } else {
648     // TODO: Should have same legality without v_perm_b32
649     getActionDefinitionsBuilder(G_BSWAP)
650       .legalFor({S32})
651       .lowerIf(narrowerThan(0, 32))
652       // FIXME: Fixing non-power-of-2 before clamp is workaround for
653       // narrowScalar limitation.
654       .widenScalarToNextPow2(0)
655       .maxScalar(0, S32)
656       .scalarize(0)
657       .lower();
658 
659     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
660       .legalFor({S32})
661       .clampScalar(0, S32, S32)
662       .widenScalarToNextPow2(0)
663       .scalarize(0);
664   }
665 
666   getActionDefinitionsBuilder(G_INTTOPTR)
667     // List the common cases
668     .legalForCartesianProduct(AddrSpaces64, {S64})
669     .legalForCartesianProduct(AddrSpaces32, {S32})
670     .scalarize(0)
671     // Accept any address space as long as the size matches
672     .legalIf(sameSize(0, 1))
673     .widenScalarIf(smallerThan(1, 0),
674       [](const LegalityQuery &Query) {
675         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
676       })
677     .narrowScalarIf(greaterThan(1, 0),
678       [](const LegalityQuery &Query) {
679         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
680       });
681 
682   getActionDefinitionsBuilder(G_PTRTOINT)
683     // List the common cases
684     .legalForCartesianProduct(AddrSpaces64, {S64})
685     .legalForCartesianProduct(AddrSpaces32, {S32})
686     .scalarize(0)
687     // Accept any address space as long as the size matches
688     .legalIf(sameSize(0, 1))
689     .widenScalarIf(smallerThan(0, 1),
690       [](const LegalityQuery &Query) {
691         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
692       })
693     .narrowScalarIf(
694       greaterThan(0, 1),
695       [](const LegalityQuery &Query) {
696         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
697       });
698 
699   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
700     .scalarize(0)
701     .custom();
702 
703   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
704   // handle some operations by just promoting the register during
705   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
706   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
707     switch (AS) {
708     // FIXME: Private element size.
709     case AMDGPUAS::PRIVATE_ADDRESS:
710       return 32;
711     // FIXME: Check subtarget
712     case AMDGPUAS::LOCAL_ADDRESS:
713       return ST.useDS128() ? 128 : 64;
714 
715     // Treat constant and global as identical. SMRD loads are sometimes usable
716     // for global loads (ideally constant address space should be eliminated)
717     // depending on the context. Legality cannot be context dependent, but
718     // RegBankSelect can split the load as necessary depending on the pointer
719     // register bank/uniformity and if the memory is invariant or not written in
720     // a kernel.
721     case AMDGPUAS::CONSTANT_ADDRESS:
722     case AMDGPUAS::GLOBAL_ADDRESS:
723       return IsLoad ? 512 : 128;
724     default:
725       return 128;
726     }
727   };
728 
729   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
730                                     bool IsLoad) -> bool {
731     const LLT DstTy = Query.Types[0];
732 
733     // Split vector extloads.
734     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
735     unsigned Align = Query.MMODescrs[0].AlignInBits;
736 
737     if (MemSize < DstTy.getSizeInBits())
738       MemSize = std::max(MemSize, Align);
739 
740     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
741       return true;
742 
743     const LLT PtrTy = Query.Types[1];
744     unsigned AS = PtrTy.getAddressSpace();
745     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
746       return true;
747 
748     // Catch weird sized loads that don't evenly divide into the access sizes
749     // TODO: May be able to widen depending on alignment etc.
750     unsigned NumRegs = (MemSize + 31) / 32;
751     if (NumRegs == 3) {
752       if (!ST.hasDwordx3LoadStores())
753         return true;
754     } else {
755       // If the alignment allows, these should have been widened.
756       if (!isPowerOf2_32(NumRegs))
757         return true;
758     }
759 
760     if (Align < MemSize) {
761       const SITargetLowering *TLI = ST.getTargetLowering();
762       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
763     }
764 
765     return false;
766   };
767 
768   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
769     unsigned Size = Query.Types[0].getSizeInBits();
770     if (isPowerOf2_32(Size))
771       return false;
772 
773     if (Size == 96 && ST.hasDwordx3LoadStores())
774       return false;
775 
776     unsigned AddrSpace = Query.Types[1].getAddressSpace();
777     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
778       return false;
779 
780     unsigned Align = Query.MMODescrs[0].AlignInBits;
781     unsigned RoundedSize = NextPowerOf2(Size);
782     return (Align >= RoundedSize);
783   };
784 
785   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
786   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
787   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
788 
789   // TODO: Refine based on subtargets which support unaligned access or 128-bit
790   // LDS
791   // TODO: Unsupported flat for SI.
792 
793   for (unsigned Op : {G_LOAD, G_STORE}) {
794     const bool IsStore = Op == G_STORE;
795 
796     auto &Actions = getActionDefinitionsBuilder(Op);
797     // Whitelist the common cases.
798     // TODO: Loads to s16 on gfx9
799     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
800                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
801                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
802                                       {S128, GlobalPtr, 128, GlobalAlign32},
803                                       {S64, GlobalPtr, 64, GlobalAlign32},
804                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
805                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
806                                       {S32, GlobalPtr, 8, GlobalAlign8},
807                                       {S32, GlobalPtr, 16, GlobalAlign16},
808 
809                                       {S32, LocalPtr, 32, 32},
810                                       {S64, LocalPtr, 64, 32},
811                                       {V2S32, LocalPtr, 64, 32},
812                                       {S32, LocalPtr, 8, 8},
813                                       {S32, LocalPtr, 16, 16},
814                                       {V2S16, LocalPtr, 32, 32},
815 
816                                       {S32, PrivatePtr, 32, 32},
817                                       {S32, PrivatePtr, 8, 8},
818                                       {S32, PrivatePtr, 16, 16},
819                                       {V2S16, PrivatePtr, 32, 32},
820 
821                                       {S32, FlatPtr, 32, GlobalAlign32},
822                                       {S32, FlatPtr, 16, GlobalAlign16},
823                                       {S32, FlatPtr, 8, GlobalAlign8},
824                                       {V2S16, FlatPtr, 32, GlobalAlign32},
825 
826                                       {S32, ConstantPtr, 32, GlobalAlign32},
827                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
828                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
829                                       {S64, ConstantPtr, 64, GlobalAlign32},
830                                       {S128, ConstantPtr, 128, GlobalAlign32},
831                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
832     Actions
833         .customIf(typeIs(1, Constant32Ptr))
834         // Widen suitably aligned loads by loading extra elements.
835         .moreElementsIf([=](const LegalityQuery &Query) {
836             const LLT Ty = Query.Types[0];
837             return Op == G_LOAD && Ty.isVector() &&
838                    shouldWidenLoadResult(Query);
839           }, moreElementsToNextPow2(0))
840         .widenScalarIf([=](const LegalityQuery &Query) {
841             const LLT Ty = Query.Types[0];
842             return Op == G_LOAD && !Ty.isVector() &&
843                    shouldWidenLoadResult(Query);
844           }, widenScalarOrEltToNextPow2(0))
845         .narrowScalarIf(
846             [=](const LegalityQuery &Query) -> bool {
847               return !Query.Types[0].isVector() &&
848                      needToSplitMemOp(Query, Op == G_LOAD);
849             },
850             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
851               const LLT DstTy = Query.Types[0];
852               const LLT PtrTy = Query.Types[1];
853 
854               const unsigned DstSize = DstTy.getSizeInBits();
855               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
856 
857               // Split extloads.
858               if (DstSize > MemSize)
859                 return std::make_pair(0, LLT::scalar(MemSize));
860 
861               if (!isPowerOf2_32(DstSize)) {
862                 // We're probably decomposing an odd sized store. Try to split
863                 // to the widest type. TODO: Account for alignment. As-is it
864                 // should be OK, since the new parts will be further legalized.
865                 unsigned FloorSize = PowerOf2Floor(DstSize);
866                 return std::make_pair(0, LLT::scalar(FloorSize));
867               }
868 
869               if (DstSize > 32 && (DstSize % 32 != 0)) {
870                 // FIXME: Need a way to specify non-extload of larger size if
871                 // suitably aligned.
872                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
873               }
874 
875               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
876                                                      Op == G_LOAD);
877               if (MemSize > MaxSize)
878                 return std::make_pair(0, LLT::scalar(MaxSize));
879 
880               unsigned Align = Query.MMODescrs[0].AlignInBits;
881               return std::make_pair(0, LLT::scalar(Align));
882             })
883         .fewerElementsIf(
884             [=](const LegalityQuery &Query) -> bool {
885               return Query.Types[0].isVector() &&
886                      needToSplitMemOp(Query, Op == G_LOAD);
887             },
888             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
889               const LLT DstTy = Query.Types[0];
890               const LLT PtrTy = Query.Types[1];
891 
892               LLT EltTy = DstTy.getElementType();
893               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
894                                                      Op == G_LOAD);
895 
896               // FIXME: Handle widened to power of 2 results better. This ends
897               // up scalarizing.
898               // FIXME: 3 element stores scalarized on SI
899 
900               // Split if it's too large for the address space.
901               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
902                 unsigned NumElts = DstTy.getNumElements();
903                 unsigned EltSize = EltTy.getSizeInBits();
904 
905                 if (MaxSize % EltSize == 0) {
906                   return std::make_pair(
907                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
908                 }
909 
910                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
911 
912                 // FIXME: Refine when odd breakdowns handled
913                 // The scalars will need to be re-legalized.
914                 if (NumPieces == 1 || NumPieces >= NumElts ||
915                     NumElts % NumPieces != 0)
916                   return std::make_pair(0, EltTy);
917 
918                 return std::make_pair(0,
919                                       LLT::vector(NumElts / NumPieces, EltTy));
920               }
921 
922               // FIXME: We could probably handle weird extending loads better.
923               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
924               if (DstTy.getSizeInBits() > MemSize)
925                 return std::make_pair(0, EltTy);
926 
927               unsigned EltSize = EltTy.getSizeInBits();
928               unsigned DstSize = DstTy.getSizeInBits();
929               if (!isPowerOf2_32(DstSize)) {
930                 // We're probably decomposing an odd sized store. Try to split
931                 // to the widest type. TODO: Account for alignment. As-is it
932                 // should be OK, since the new parts will be further legalized.
933                 unsigned FloorSize = PowerOf2Floor(DstSize);
934                 return std::make_pair(
935                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
936               }
937 
938               // Need to split because of alignment.
939               unsigned Align = Query.MMODescrs[0].AlignInBits;
940               if (EltSize > Align &&
941                   (EltSize / Align < DstTy.getNumElements())) {
942                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
943               }
944 
945               // May need relegalization for the scalars.
946               return std::make_pair(0, EltTy);
947             })
948         .minScalar(0, S32);
949 
950     if (IsStore)
951       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
952 
953     // TODO: Need a bitcast lower option?
954     Actions
955         .legalIf([=](const LegalityQuery &Query) {
956           const LLT Ty0 = Query.Types[0];
957           unsigned Size = Ty0.getSizeInBits();
958           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
959           unsigned Align = Query.MMODescrs[0].AlignInBits;
960 
961           // FIXME: Widening store from alignment not valid.
962           if (MemSize < Size)
963             MemSize = std::max(MemSize, Align);
964 
965           // No extending vector loads.
966           if (Size > MemSize && Ty0.isVector())
967             return false;
968 
969           switch (MemSize) {
970           case 8:
971           case 16:
972             return Size == 32;
973           case 32:
974           case 64:
975           case 128:
976             return true;
977           case 96:
978             return ST.hasDwordx3LoadStores();
979           case 256:
980           case 512:
981             return true;
982           default:
983             return false;
984           }
985         })
986         .widenScalarToNextPow2(0)
987         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
988   }
989 
990   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
991                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
992                                                   {S32, GlobalPtr, 16, 2 * 8},
993                                                   {S32, LocalPtr, 8, 8},
994                                                   {S32, LocalPtr, 16, 16},
995                                                   {S32, PrivatePtr, 8, 8},
996                                                   {S32, PrivatePtr, 16, 16},
997                                                   {S32, ConstantPtr, 8, 8},
998                                                   {S32, ConstantPtr, 16, 2 * 8}});
999   if (ST.hasFlatAddressSpace()) {
1000     ExtLoads.legalForTypesWithMemDesc(
1001         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1002   }
1003 
1004   ExtLoads.clampScalar(0, S32, S32)
1005           .widenScalarToNextPow2(0)
1006           .unsupportedIfMemSizeNotPow2()
1007           .lower();
1008 
1009   auto &Atomics = getActionDefinitionsBuilder(
1010     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1011      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1012      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1013      G_ATOMICRMW_UMIN})
1014     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1015                {S64, GlobalPtr}, {S64, LocalPtr}});
1016   if (ST.hasFlatAddressSpace()) {
1017     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1018   }
1019 
1020   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1021     .legalFor({{S32, LocalPtr}});
1022 
1023   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1024   // demarshalling
1025   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1026     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1027                 {S32, FlatPtr}, {S64, FlatPtr}})
1028     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1029                {S32, RegionPtr}, {S64, RegionPtr}});
1030   // TODO: Pointer types, any 32-bit or 64-bit vector
1031 
1032   // Condition should be s32 for scalar, s1 for vector.
1033   getActionDefinitionsBuilder(G_SELECT)
1034     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1035           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1036           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1037     .clampScalar(0, S16, S64)
1038     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1039     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1040     .scalarize(1)
1041     .clampMaxNumElements(0, S32, 2)
1042     .clampMaxNumElements(0, LocalPtr, 2)
1043     .clampMaxNumElements(0, PrivatePtr, 2)
1044     .scalarize(0)
1045     .widenScalarToNextPow2(0)
1046     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1047 
1048   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1049   // be more flexible with the shift amount type.
1050   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1051     .legalFor({{S32, S32}, {S64, S32}});
1052   if (ST.has16BitInsts()) {
1053     if (ST.hasVOP3PInsts()) {
1054       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1055             .clampMaxNumElements(0, S16, 2);
1056     } else
1057       Shifts.legalFor({{S16, S32}, {S16, S16}});
1058 
1059     // TODO: Support 16-bit shift amounts
1060     Shifts.clampScalar(1, S32, S32);
1061     Shifts.clampScalar(0, S16, S64);
1062     Shifts.widenScalarToNextPow2(0, 16);
1063   } else {
1064     // Make sure we legalize the shift amount type first, as the general
1065     // expansion for the shifted type will produce much worse code if it hasn't
1066     // been truncated already.
1067     Shifts.clampScalar(1, S32, S32);
1068     Shifts.clampScalar(0, S32, S64);
1069     Shifts.widenScalarToNextPow2(0, 32);
1070   }
1071   Shifts.scalarize(0);
1072 
1073   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1074     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1075     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1076     unsigned IdxTypeIdx = 2;
1077 
1078     getActionDefinitionsBuilder(Op)
1079       .customIf([=](const LegalityQuery &Query) {
1080           const LLT EltTy = Query.Types[EltTypeIdx];
1081           const LLT VecTy = Query.Types[VecTypeIdx];
1082           const LLT IdxTy = Query.Types[IdxTypeIdx];
1083           return (EltTy.getSizeInBits() == 16 ||
1084                   EltTy.getSizeInBits() % 32 == 0) &&
1085                  VecTy.getSizeInBits() % 32 == 0 &&
1086                  VecTy.getSizeInBits() <= 1024 &&
1087                  IdxTy.getSizeInBits() == 32;
1088         })
1089       .clampScalar(EltTypeIdx, S32, S64)
1090       .clampScalar(VecTypeIdx, S32, S64)
1091       .clampScalar(IdxTypeIdx, S32, S32);
1092   }
1093 
1094   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1095     .unsupportedIf([=](const LegalityQuery &Query) {
1096         const LLT &EltTy = Query.Types[1].getElementType();
1097         return Query.Types[0] != EltTy;
1098       });
1099 
1100   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1101     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1102     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1103 
1104     // FIXME: Doesn't handle extract of illegal sizes.
1105     getActionDefinitionsBuilder(Op)
1106       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1107       // FIXME: Multiples of 16 should not be legal.
1108       .legalIf([=](const LegalityQuery &Query) {
1109           const LLT BigTy = Query.Types[BigTyIdx];
1110           const LLT LitTy = Query.Types[LitTyIdx];
1111           return (BigTy.getSizeInBits() % 32 == 0) &&
1112                  (LitTy.getSizeInBits() % 16 == 0);
1113         })
1114       .widenScalarIf(
1115         [=](const LegalityQuery &Query) {
1116           const LLT BigTy = Query.Types[BigTyIdx];
1117           return (BigTy.getScalarSizeInBits() < 16);
1118         },
1119         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1120       .widenScalarIf(
1121         [=](const LegalityQuery &Query) {
1122           const LLT LitTy = Query.Types[LitTyIdx];
1123           return (LitTy.getScalarSizeInBits() < 16);
1124         },
1125         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1126       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1127       .widenScalarToNextPow2(BigTyIdx, 32);
1128 
1129   }
1130 
1131   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1132     .legalForCartesianProduct(AllS32Vectors, {S32})
1133     .legalForCartesianProduct(AllS64Vectors, {S64})
1134     .clampNumElements(0, V16S32, V32S32)
1135     .clampNumElements(0, V2S64, V16S64)
1136     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1137 
1138   if (ST.hasScalarPackInsts()) {
1139     BuildVector
1140       // FIXME: Should probably widen s1 vectors straight to s32
1141       .minScalarOrElt(0, S16)
1142       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1143       .minScalar(1, S32);
1144 
1145     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1146       .legalFor({V2S16, S32})
1147       .lower();
1148     BuildVector.minScalarOrElt(0, S32);
1149   } else {
1150     BuildVector.customFor({V2S16, S16});
1151     BuildVector.minScalarOrElt(0, S32);
1152 
1153     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1154       .customFor({V2S16, S32})
1155       .lower();
1156   }
1157 
1158   BuildVector.legalIf(isRegisterType(0));
1159 
1160   // FIXME: Clamp maximum size
1161   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1162     .legalIf(isRegisterType(0));
1163 
1164   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1165   // pre-legalize.
1166   if (ST.hasVOP3PInsts()) {
1167     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1168       .customFor({V2S16, V2S16})
1169       .lower();
1170   } else
1171     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1172 
1173   // Merge/Unmerge
1174   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1175     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1176     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1177 
1178     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1179       const LLT &Ty = Query.Types[TypeIdx];
1180       if (Ty.isVector()) {
1181         const LLT &EltTy = Ty.getElementType();
1182         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1183           return true;
1184         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1185           return true;
1186       }
1187       return false;
1188     };
1189 
1190     auto &Builder = getActionDefinitionsBuilder(Op)
1191       // Try to widen to s16 first for small types.
1192       // TODO: Only do this on targets with legal s16 shifts
1193       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1194 
1195       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1196       .lowerFor({{S16, V2S16}})
1197       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1198       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1199                            elementTypeIs(1, S16)),
1200                        changeTo(1, V2S16))
1201       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1202       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1203       // valid.
1204       .clampScalar(LitTyIdx, S32, S256)
1205       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1206       // Break up vectors with weird elements into scalars
1207       .fewerElementsIf(
1208         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1209         scalarize(0))
1210       .fewerElementsIf(
1211         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1212         scalarize(1))
1213       .clampScalar(BigTyIdx, S32, S1024);
1214 
1215     if (Op == G_MERGE_VALUES) {
1216       Builder.widenScalarIf(
1217         // TODO: Use 16-bit shifts if legal for 8-bit values?
1218         [=](const LegalityQuery &Query) {
1219           const LLT Ty = Query.Types[LitTyIdx];
1220           return Ty.getSizeInBits() < 32;
1221         },
1222         changeTo(LitTyIdx, S32));
1223     }
1224 
1225     Builder.widenScalarIf(
1226       [=](const LegalityQuery &Query) {
1227         const LLT Ty = Query.Types[BigTyIdx];
1228         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1229           Ty.getSizeInBits() % 16 != 0;
1230       },
1231       [=](const LegalityQuery &Query) {
1232         // Pick the next power of 2, or a multiple of 64 over 128.
1233         // Whichever is smaller.
1234         const LLT &Ty = Query.Types[BigTyIdx];
1235         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1236         if (NewSizeInBits >= 256) {
1237           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1238           if (RoundedTo < NewSizeInBits)
1239             NewSizeInBits = RoundedTo;
1240         }
1241         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1242       })
1243       .legalIf([=](const LegalityQuery &Query) {
1244           const LLT &BigTy = Query.Types[BigTyIdx];
1245           const LLT &LitTy = Query.Types[LitTyIdx];
1246 
1247           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1248             return false;
1249           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1250             return false;
1251 
1252           return BigTy.getSizeInBits() % 16 == 0 &&
1253                  LitTy.getSizeInBits() % 16 == 0 &&
1254                  BigTy.getSizeInBits() <= 1024;
1255         })
1256       // Any vectors left are the wrong size. Scalarize them.
1257       .scalarize(0)
1258       .scalarize(1);
1259   }
1260 
1261   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1262   // RegBankSelect.
1263   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1264     .legalFor({{S32}, {S64}});
1265 
1266   if (ST.hasVOP3PInsts()) {
1267     SextInReg.lowerFor({{V2S16}})
1268       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1269       // get more vector shift opportunities, since we'll get those when
1270       // expanded.
1271       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1272   } else if (ST.has16BitInsts()) {
1273     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1274   } else {
1275     // Prefer to promote to s32 before lowering if we don't have 16-bit
1276     // shifts. This avoid a lot of intermediate truncate and extend operations.
1277     SextInReg.lowerFor({{S32}, {S64}});
1278   }
1279 
1280   SextInReg
1281     .scalarize(0)
1282     .clampScalar(0, S32, S64)
1283     .lower();
1284 
1285   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1286     .legalFor({S64});
1287 
1288   getActionDefinitionsBuilder({
1289       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1290       G_FCOPYSIGN,
1291 
1292       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1293       G_READ_REGISTER,
1294       G_WRITE_REGISTER,
1295 
1296       G_SADDO, G_SSUBO,
1297 
1298        // TODO: Implement
1299       G_FMINIMUM, G_FMAXIMUM
1300     }).lower();
1301 
1302   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1303         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1304         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1305     .unsupported();
1306 
1307   computeTables();
1308   verify(*ST.getInstrInfo());
1309 }
1310 
1311 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1312                                          MachineRegisterInfo &MRI,
1313                                          MachineIRBuilder &B,
1314                                          GISelChangeObserver &Observer) const {
1315   switch (MI.getOpcode()) {
1316   case TargetOpcode::G_ADDRSPACE_CAST:
1317     return legalizeAddrSpaceCast(MI, MRI, B);
1318   case TargetOpcode::G_FRINT:
1319     return legalizeFrint(MI, MRI, B);
1320   case TargetOpcode::G_FCEIL:
1321     return legalizeFceil(MI, MRI, B);
1322   case TargetOpcode::G_INTRINSIC_TRUNC:
1323     return legalizeIntrinsicTrunc(MI, MRI, B);
1324   case TargetOpcode::G_SITOFP:
1325     return legalizeITOFP(MI, MRI, B, true);
1326   case TargetOpcode::G_UITOFP:
1327     return legalizeITOFP(MI, MRI, B, false);
1328   case TargetOpcode::G_FPTOSI:
1329     return legalizeFPTOI(MI, MRI, B, true);
1330   case TargetOpcode::G_FPTOUI:
1331     return legalizeFPTOI(MI, MRI, B, false);
1332   case TargetOpcode::G_FMINNUM:
1333   case TargetOpcode::G_FMAXNUM:
1334   case TargetOpcode::G_FMINNUM_IEEE:
1335   case TargetOpcode::G_FMAXNUM_IEEE:
1336     return legalizeMinNumMaxNum(MI, MRI, B);
1337   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1338     return legalizeExtractVectorElt(MI, MRI, B);
1339   case TargetOpcode::G_INSERT_VECTOR_ELT:
1340     return legalizeInsertVectorElt(MI, MRI, B);
1341   case TargetOpcode::G_SHUFFLE_VECTOR:
1342     return legalizeShuffleVector(MI, MRI, B);
1343   case TargetOpcode::G_FSIN:
1344   case TargetOpcode::G_FCOS:
1345     return legalizeSinCos(MI, MRI, B);
1346   case TargetOpcode::G_GLOBAL_VALUE:
1347     return legalizeGlobalValue(MI, MRI, B);
1348   case TargetOpcode::G_LOAD:
1349     return legalizeLoad(MI, MRI, B, Observer);
1350   case TargetOpcode::G_FMAD:
1351     return legalizeFMad(MI, MRI, B);
1352   case TargetOpcode::G_FDIV:
1353     return legalizeFDIV(MI, MRI, B);
1354   case TargetOpcode::G_UDIV:
1355   case TargetOpcode::G_UREM:
1356     return legalizeUDIV_UREM(MI, MRI, B);
1357   case TargetOpcode::G_SDIV:
1358   case TargetOpcode::G_SREM:
1359     return legalizeSDIV_SREM(MI, MRI, B);
1360   case TargetOpcode::G_ATOMIC_CMPXCHG:
1361     return legalizeAtomicCmpXChg(MI, MRI, B);
1362   case TargetOpcode::G_FLOG:
1363     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1364   case TargetOpcode::G_FLOG10:
1365     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1366   case TargetOpcode::G_FEXP:
1367     return legalizeFExp(MI, B);
1368   case TargetOpcode::G_FFLOOR:
1369     return legalizeFFloor(MI, MRI, B);
1370   case TargetOpcode::G_BUILD_VECTOR:
1371     return legalizeBuildVector(MI, MRI, B);
1372   default:
1373     return false;
1374   }
1375 
1376   llvm_unreachable("expected switch to return");
1377 }
1378 
1379 Register AMDGPULegalizerInfo::getSegmentAperture(
1380   unsigned AS,
1381   MachineRegisterInfo &MRI,
1382   MachineIRBuilder &B) const {
1383   MachineFunction &MF = B.getMF();
1384   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1385   const LLT S32 = LLT::scalar(32);
1386 
1387   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1388 
1389   if (ST.hasApertureRegs()) {
1390     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1391     // getreg.
1392     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1393         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1394         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1395     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1396         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1397         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1398     unsigned Encoding =
1399         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1400         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1401         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1402 
1403     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1404 
1405     B.buildInstr(AMDGPU::S_GETREG_B32)
1406       .addDef(GetReg)
1407       .addImm(Encoding);
1408     MRI.setType(GetReg, S32);
1409 
1410     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1411     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1412   }
1413 
1414   Register QueuePtr = MRI.createGenericVirtualRegister(
1415     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1416 
1417   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1418   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1419     return Register();
1420 
1421   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1422   // private_segment_aperture_base_hi.
1423   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1424 
1425   // TODO: can we be smarter about machine pointer info?
1426   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1427   MachineMemOperand *MMO = MF.getMachineMemOperand(
1428     PtrInfo,
1429     MachineMemOperand::MOLoad |
1430     MachineMemOperand::MODereferenceable |
1431     MachineMemOperand::MOInvariant,
1432     4,
1433     MinAlign(64, StructOffset));
1434 
1435   Register LoadAddr;
1436 
1437   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1438   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1439 }
1440 
1441 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1442   MachineInstr &MI, MachineRegisterInfo &MRI,
1443   MachineIRBuilder &B) const {
1444   MachineFunction &MF = B.getMF();
1445 
1446   B.setInstr(MI);
1447 
1448   const LLT S32 = LLT::scalar(32);
1449   Register Dst = MI.getOperand(0).getReg();
1450   Register Src = MI.getOperand(1).getReg();
1451 
1452   LLT DstTy = MRI.getType(Dst);
1453   LLT SrcTy = MRI.getType(Src);
1454   unsigned DestAS = DstTy.getAddressSpace();
1455   unsigned SrcAS = SrcTy.getAddressSpace();
1456 
1457   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1458   // vector element.
1459   assert(!DstTy.isVector());
1460 
1461   const AMDGPUTargetMachine &TM
1462     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1463 
1464   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1465   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1466     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1467     return true;
1468   }
1469 
1470   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1471     // Truncate.
1472     B.buildExtract(Dst, Src, 0);
1473     MI.eraseFromParent();
1474     return true;
1475   }
1476 
1477   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1478     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1479     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1480 
1481     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1482     // another. Merge operands are required to be the same type, but creating an
1483     // extra ptrtoint would be kind of pointless.
1484     auto HighAddr = B.buildConstant(
1485       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1486     B.buildMerge(Dst, {Src, HighAddr});
1487     MI.eraseFromParent();
1488     return true;
1489   }
1490 
1491   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1492     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1493            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1494     unsigned NullVal = TM.getNullPointerValue(DestAS);
1495 
1496     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1497     auto FlatNull = B.buildConstant(SrcTy, 0);
1498 
1499     // Extract low 32-bits of the pointer.
1500     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1501 
1502     auto CmpRes =
1503         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1504     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1505 
1506     MI.eraseFromParent();
1507     return true;
1508   }
1509 
1510   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1511     return false;
1512 
1513   if (!ST.hasFlatAddressSpace())
1514     return false;
1515 
1516   auto SegmentNull =
1517       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1518   auto FlatNull =
1519       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1520 
1521   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1522   if (!ApertureReg.isValid())
1523     return false;
1524 
1525   auto CmpRes =
1526       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1527 
1528   // Coerce the type of the low half of the result so we can use merge_values.
1529   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1530 
1531   // TODO: Should we allow mismatched types but matching sizes in merges to
1532   // avoid the ptrtoint?
1533   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1534   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1535 
1536   MI.eraseFromParent();
1537   return true;
1538 }
1539 
1540 bool AMDGPULegalizerInfo::legalizeFrint(
1541   MachineInstr &MI, MachineRegisterInfo &MRI,
1542   MachineIRBuilder &B) const {
1543   B.setInstr(MI);
1544 
1545   Register Src = MI.getOperand(1).getReg();
1546   LLT Ty = MRI.getType(Src);
1547   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1548 
1549   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1550   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1551 
1552   auto C1 = B.buildFConstant(Ty, C1Val);
1553   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1554 
1555   // TODO: Should this propagate fast-math-flags?
1556   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1557   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1558 
1559   auto C2 = B.buildFConstant(Ty, C2Val);
1560   auto Fabs = B.buildFAbs(Ty, Src);
1561 
1562   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1563   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1564   return true;
1565 }
1566 
1567 bool AMDGPULegalizerInfo::legalizeFceil(
1568   MachineInstr &MI, MachineRegisterInfo &MRI,
1569   MachineIRBuilder &B) const {
1570   B.setInstr(MI);
1571 
1572   const LLT S1 = LLT::scalar(1);
1573   const LLT S64 = LLT::scalar(64);
1574 
1575   Register Src = MI.getOperand(1).getReg();
1576   assert(MRI.getType(Src) == S64);
1577 
1578   // result = trunc(src)
1579   // if (src > 0.0 && src != result)
1580   //   result += 1.0
1581 
1582   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1583 
1584   const auto Zero = B.buildFConstant(S64, 0.0);
1585   const auto One = B.buildFConstant(S64, 1.0);
1586   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1587   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1588   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1589   auto Add = B.buildSelect(S64, And, One, Zero);
1590 
1591   // TODO: Should this propagate fast-math-flags?
1592   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1593   return true;
1594 }
1595 
1596 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1597                                               MachineIRBuilder &B) {
1598   const unsigned FractBits = 52;
1599   const unsigned ExpBits = 11;
1600   LLT S32 = LLT::scalar(32);
1601 
1602   auto Const0 = B.buildConstant(S32, FractBits - 32);
1603   auto Const1 = B.buildConstant(S32, ExpBits);
1604 
1605   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1606     .addUse(Const0.getReg(0))
1607     .addUse(Const1.getReg(0));
1608 
1609   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1610 }
1611 
1612 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1613   MachineInstr &MI, MachineRegisterInfo &MRI,
1614   MachineIRBuilder &B) const {
1615   B.setInstr(MI);
1616 
1617   const LLT S1 = LLT::scalar(1);
1618   const LLT S32 = LLT::scalar(32);
1619   const LLT S64 = LLT::scalar(64);
1620 
1621   Register Src = MI.getOperand(1).getReg();
1622   assert(MRI.getType(Src) == S64);
1623 
1624   // TODO: Should this use extract since the low half is unused?
1625   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1626   Register Hi = Unmerge.getReg(1);
1627 
1628   // Extract the upper half, since this is where we will find the sign and
1629   // exponent.
1630   auto Exp = extractF64Exponent(Hi, B);
1631 
1632   const unsigned FractBits = 52;
1633 
1634   // Extract the sign bit.
1635   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1636   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1637 
1638   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1639 
1640   const auto Zero32 = B.buildConstant(S32, 0);
1641 
1642   // Extend back to 64-bits.
1643   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1644 
1645   auto Shr = B.buildAShr(S64, FractMask, Exp);
1646   auto Not = B.buildNot(S64, Shr);
1647   auto Tmp0 = B.buildAnd(S64, Src, Not);
1648   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1649 
1650   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1651   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1652 
1653   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1654   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1655   return true;
1656 }
1657 
1658 bool AMDGPULegalizerInfo::legalizeITOFP(
1659   MachineInstr &MI, MachineRegisterInfo &MRI,
1660   MachineIRBuilder &B, bool Signed) const {
1661   B.setInstr(MI);
1662 
1663   Register Dst = MI.getOperand(0).getReg();
1664   Register Src = MI.getOperand(1).getReg();
1665 
1666   const LLT S64 = LLT::scalar(64);
1667   const LLT S32 = LLT::scalar(32);
1668 
1669   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1670 
1671   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1672 
1673   auto CvtHi = Signed ?
1674     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1675     B.buildUITOFP(S64, Unmerge.getReg(1));
1676 
1677   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1678 
1679   auto ThirtyTwo = B.buildConstant(S32, 32);
1680   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1681     .addUse(CvtHi.getReg(0))
1682     .addUse(ThirtyTwo.getReg(0));
1683 
1684   // TODO: Should this propagate fast-math-flags?
1685   B.buildFAdd(Dst, LdExp, CvtLo);
1686   MI.eraseFromParent();
1687   return true;
1688 }
1689 
1690 // TODO: Copied from DAG implementation. Verify logic and document how this
1691 // actually works.
1692 bool AMDGPULegalizerInfo::legalizeFPTOI(
1693   MachineInstr &MI, MachineRegisterInfo &MRI,
1694   MachineIRBuilder &B, bool Signed) const {
1695   B.setInstr(MI);
1696 
1697   Register Dst = MI.getOperand(0).getReg();
1698   Register Src = MI.getOperand(1).getReg();
1699 
1700   const LLT S64 = LLT::scalar(64);
1701   const LLT S32 = LLT::scalar(32);
1702 
1703   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1704 
1705   unsigned Flags = MI.getFlags();
1706 
1707   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1708   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1709   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1710 
1711   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1712   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1713   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1714 
1715   auto Hi = Signed ?
1716     B.buildFPTOSI(S32, FloorMul) :
1717     B.buildFPTOUI(S32, FloorMul);
1718   auto Lo = B.buildFPTOUI(S32, Fma);
1719 
1720   B.buildMerge(Dst, { Lo, Hi });
1721   MI.eraseFromParent();
1722 
1723   return true;
1724 }
1725 
1726 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1727   MachineInstr &MI, MachineRegisterInfo &MRI,
1728   MachineIRBuilder &B) const {
1729   MachineFunction &MF = B.getMF();
1730   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1731 
1732   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1733                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1734 
1735   // With ieee_mode disabled, the instructions have the correct behavior
1736   // already for G_FMINNUM/G_FMAXNUM
1737   if (!MFI->getMode().IEEE)
1738     return !IsIEEEOp;
1739 
1740   if (IsIEEEOp)
1741     return true;
1742 
1743   MachineIRBuilder HelperBuilder(MI);
1744   GISelObserverWrapper DummyObserver;
1745   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1746   HelperBuilder.setInstr(MI);
1747   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1748 }
1749 
1750 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1751   MachineInstr &MI, MachineRegisterInfo &MRI,
1752   MachineIRBuilder &B) const {
1753   // TODO: Should move some of this into LegalizerHelper.
1754 
1755   // TODO: Promote dynamic indexing of s16 to s32
1756 
1757   // FIXME: Artifact combiner probably should have replaced the truncated
1758   // constant before this, so we shouldn't need
1759   // getConstantVRegValWithLookThrough.
1760   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1761     MI.getOperand(2).getReg(), MRI);
1762   if (!IdxVal) // Dynamic case will be selected to register indexing.
1763     return true;
1764 
1765   Register Dst = MI.getOperand(0).getReg();
1766   Register Vec = MI.getOperand(1).getReg();
1767 
1768   LLT VecTy = MRI.getType(Vec);
1769   LLT EltTy = VecTy.getElementType();
1770   assert(EltTy == MRI.getType(Dst));
1771 
1772   B.setInstr(MI);
1773 
1774   if (IdxVal->Value < VecTy.getNumElements())
1775     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1776   else
1777     B.buildUndef(Dst);
1778 
1779   MI.eraseFromParent();
1780   return true;
1781 }
1782 
1783 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1784   MachineInstr &MI, MachineRegisterInfo &MRI,
1785   MachineIRBuilder &B) const {
1786   // TODO: Should move some of this into LegalizerHelper.
1787 
1788   // TODO: Promote dynamic indexing of s16 to s32
1789 
1790   // FIXME: Artifact combiner probably should have replaced the truncated
1791   // constant before this, so we shouldn't need
1792   // getConstantVRegValWithLookThrough.
1793   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1794     MI.getOperand(3).getReg(), MRI);
1795   if (!IdxVal) // Dynamic case will be selected to register indexing.
1796     return true;
1797 
1798   Register Dst = MI.getOperand(0).getReg();
1799   Register Vec = MI.getOperand(1).getReg();
1800   Register Ins = MI.getOperand(2).getReg();
1801 
1802   LLT VecTy = MRI.getType(Vec);
1803   LLT EltTy = VecTy.getElementType();
1804   assert(EltTy == MRI.getType(Ins));
1805 
1806   B.setInstr(MI);
1807 
1808   if (IdxVal->Value < VecTy.getNumElements())
1809     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1810   else
1811     B.buildUndef(Dst);
1812 
1813   MI.eraseFromParent();
1814   return true;
1815 }
1816 
1817 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1818   assert(Mask.size() == 2);
1819 
1820   // If one half is undef, the other is trivially in the same reg.
1821   if (Mask[0] == -1 || Mask[1] == -1)
1822     return true;
1823   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1824          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1825 }
1826 
1827 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1828   MachineInstr &MI, MachineRegisterInfo &MRI,
1829   MachineIRBuilder &B) const {
1830   const LLT V2S16 = LLT::vector(2, 16);
1831 
1832   Register Dst = MI.getOperand(0).getReg();
1833   Register Src0 = MI.getOperand(1).getReg();
1834   LLT DstTy = MRI.getType(Dst);
1835   LLT SrcTy = MRI.getType(Src0);
1836 
1837   if (SrcTy == V2S16 && DstTy == V2S16 &&
1838       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1839     return true;
1840 
1841   MachineIRBuilder HelperBuilder(MI);
1842   GISelObserverWrapper DummyObserver;
1843   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1844   HelperBuilder.setInstr(MI);
1845   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1846 }
1847 
1848 bool AMDGPULegalizerInfo::legalizeSinCos(
1849   MachineInstr &MI, MachineRegisterInfo &MRI,
1850   MachineIRBuilder &B) const {
1851   B.setInstr(MI);
1852 
1853   Register DstReg = MI.getOperand(0).getReg();
1854   Register SrcReg = MI.getOperand(1).getReg();
1855   LLT Ty = MRI.getType(DstReg);
1856   unsigned Flags = MI.getFlags();
1857 
1858   Register TrigVal;
1859   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1860   if (ST.hasTrigReducedRange()) {
1861     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1862     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1863       .addUse(MulVal.getReg(0))
1864       .setMIFlags(Flags).getReg(0);
1865   } else
1866     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1867 
1868   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1869     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1870   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1871     .addUse(TrigVal)
1872     .setMIFlags(Flags);
1873   MI.eraseFromParent();
1874   return true;
1875 }
1876 
1877 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1878   Register DstReg, LLT PtrTy,
1879   MachineIRBuilder &B, const GlobalValue *GV,
1880   unsigned Offset, unsigned GAFlags) const {
1881   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1882   // to the following code sequence:
1883   //
1884   // For constant address space:
1885   //   s_getpc_b64 s[0:1]
1886   //   s_add_u32 s0, s0, $symbol
1887   //   s_addc_u32 s1, s1, 0
1888   //
1889   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1890   //   a fixup or relocation is emitted to replace $symbol with a literal
1891   //   constant, which is a pc-relative offset from the encoding of the $symbol
1892   //   operand to the global variable.
1893   //
1894   // For global address space:
1895   //   s_getpc_b64 s[0:1]
1896   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1897   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1898   //
1899   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1900   //   fixups or relocations are emitted to replace $symbol@*@lo and
1901   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1902   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1903   //   operand to the global variable.
1904   //
1905   // What we want here is an offset from the value returned by s_getpc
1906   // (which is the address of the s_add_u32 instruction) to the global
1907   // variable, but since the encoding of $symbol starts 4 bytes after the start
1908   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1909   // small. This requires us to add 4 to the global variable offset in order to
1910   // compute the correct address.
1911 
1912   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1913 
1914   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1915     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1916 
1917   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1918     .addDef(PCReg);
1919 
1920   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1921   if (GAFlags == SIInstrInfo::MO_NONE)
1922     MIB.addImm(0);
1923   else
1924     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1925 
1926   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1927 
1928   if (PtrTy.getSizeInBits() == 32)
1929     B.buildExtract(DstReg, PCReg, 0);
1930   return true;
1931  }
1932 
1933 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1934   MachineInstr &MI, MachineRegisterInfo &MRI,
1935   MachineIRBuilder &B) const {
1936   Register DstReg = MI.getOperand(0).getReg();
1937   LLT Ty = MRI.getType(DstReg);
1938   unsigned AS = Ty.getAddressSpace();
1939 
1940   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1941   MachineFunction &MF = B.getMF();
1942   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1943   B.setInstr(MI);
1944 
1945   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1946     if (!MFI->isEntryFunction()) {
1947       const Function &Fn = MF.getFunction();
1948       DiagnosticInfoUnsupported BadLDSDecl(
1949         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1950       Fn.getContext().diagnose(BadLDSDecl);
1951     }
1952 
1953     // TODO: We could emit code to handle the initialization somewhere.
1954     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1955       const SITargetLowering *TLI = ST.getTargetLowering();
1956       if (!TLI->shouldUseLDSConstAddress(GV)) {
1957         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1958         return true; // Leave in place;
1959       }
1960 
1961       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1962       MI.eraseFromParent();
1963       return true;
1964     }
1965 
1966     const Function &Fn = MF.getFunction();
1967     DiagnosticInfoUnsupported BadInit(
1968       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1969     Fn.getContext().diagnose(BadInit);
1970     return true;
1971   }
1972 
1973   const SITargetLowering *TLI = ST.getTargetLowering();
1974 
1975   if (TLI->shouldEmitFixup(GV)) {
1976     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1977     MI.eraseFromParent();
1978     return true;
1979   }
1980 
1981   if (TLI->shouldEmitPCReloc(GV)) {
1982     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1983     MI.eraseFromParent();
1984     return true;
1985   }
1986 
1987   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1988   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1989 
1990   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1991     MachinePointerInfo::getGOT(MF),
1992     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1993     MachineMemOperand::MOInvariant,
1994     8 /*Size*/, 8 /*Align*/);
1995 
1996   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1997 
1998   if (Ty.getSizeInBits() == 32) {
1999     // Truncate if this is a 32-bit constant adrdess.
2000     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2001     B.buildExtract(DstReg, Load, 0);
2002   } else
2003     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2004 
2005   MI.eraseFromParent();
2006   return true;
2007 }
2008 
2009 bool AMDGPULegalizerInfo::legalizeLoad(
2010   MachineInstr &MI, MachineRegisterInfo &MRI,
2011   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2012   B.setInstr(MI);
2013   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2014   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2015   Observer.changingInstr(MI);
2016   MI.getOperand(1).setReg(Cast.getReg(0));
2017   Observer.changedInstr(MI);
2018   return true;
2019 }
2020 
2021 bool AMDGPULegalizerInfo::legalizeFMad(
2022   MachineInstr &MI, MachineRegisterInfo &MRI,
2023   MachineIRBuilder &B) const {
2024   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2025   assert(Ty.isScalar());
2026 
2027   MachineFunction &MF = B.getMF();
2028   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2029 
2030   // TODO: Always legal with future ftz flag.
2031   // FIXME: Do we need just output?
2032   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2033     return true;
2034   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2035     return true;
2036 
2037   MachineIRBuilder HelperBuilder(MI);
2038   GISelObserverWrapper DummyObserver;
2039   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2040   HelperBuilder.setMBB(*MI.getParent());
2041   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2042 }
2043 
2044 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2045   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2046   Register DstReg = MI.getOperand(0).getReg();
2047   Register PtrReg = MI.getOperand(1).getReg();
2048   Register CmpVal = MI.getOperand(2).getReg();
2049   Register NewVal = MI.getOperand(3).getReg();
2050 
2051   assert(SITargetLowering::isFlatGlobalAddrSpace(
2052            MRI.getType(PtrReg).getAddressSpace()) &&
2053          "this should not have been custom lowered");
2054 
2055   LLT ValTy = MRI.getType(CmpVal);
2056   LLT VecTy = LLT::vector(2, ValTy);
2057 
2058   B.setInstr(MI);
2059   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2060 
2061   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2062     .addDef(DstReg)
2063     .addUse(PtrReg)
2064     .addUse(PackedVal)
2065     .setMemRefs(MI.memoperands());
2066 
2067   MI.eraseFromParent();
2068   return true;
2069 }
2070 
2071 bool AMDGPULegalizerInfo::legalizeFlog(
2072   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2073   Register Dst = MI.getOperand(0).getReg();
2074   Register Src = MI.getOperand(1).getReg();
2075   LLT Ty = B.getMRI()->getType(Dst);
2076   unsigned Flags = MI.getFlags();
2077   B.setInstr(MI);
2078 
2079   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2080   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2081 
2082   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2083   MI.eraseFromParent();
2084   return true;
2085 }
2086 
2087 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2088                                        MachineIRBuilder &B) const {
2089   Register Dst = MI.getOperand(0).getReg();
2090   Register Src = MI.getOperand(1).getReg();
2091   unsigned Flags = MI.getFlags();
2092   LLT Ty = B.getMRI()->getType(Dst);
2093   B.setInstr(MI);
2094 
2095   auto K = B.buildFConstant(Ty, numbers::log2e);
2096   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2097   B.buildFExp2(Dst, Mul, Flags);
2098   MI.eraseFromParent();
2099   return true;
2100 }
2101 
2102 // Find a source register, ignoring any possible source modifiers.
2103 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2104   Register ModSrc = OrigSrc;
2105   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2106     ModSrc = SrcFNeg->getOperand(1).getReg();
2107     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2108       ModSrc = SrcFAbs->getOperand(1).getReg();
2109   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2110     ModSrc = SrcFAbs->getOperand(1).getReg();
2111   return ModSrc;
2112 }
2113 
2114 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2115                                          MachineRegisterInfo &MRI,
2116                                          MachineIRBuilder &B) const {
2117   B.setInstr(MI);
2118 
2119   const LLT S1 = LLT::scalar(1);
2120   const LLT S64 = LLT::scalar(64);
2121   Register Dst = MI.getOperand(0).getReg();
2122   Register OrigSrc = MI.getOperand(1).getReg();
2123   unsigned Flags = MI.getFlags();
2124   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2125          "this should not have been custom lowered");
2126 
2127   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2128   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2129   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2130   // V_FRACT bug is:
2131   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2132   //
2133   // Convert floor(x) to (x - fract(x))
2134 
2135   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2136     .addUse(OrigSrc)
2137     .setMIFlags(Flags);
2138 
2139   // Give source modifier matching some assistance before obscuring a foldable
2140   // pattern.
2141 
2142   // TODO: We can avoid the neg on the fract? The input sign to fract
2143   // shouldn't matter?
2144   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2145 
2146   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2147 
2148   Register Min = MRI.createGenericVirtualRegister(S64);
2149 
2150   // We don't need to concern ourselves with the snan handling difference, so
2151   // use the one which will directly select.
2152   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2153   if (MFI->getMode().IEEE)
2154     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2155   else
2156     B.buildFMinNum(Min, Fract, Const, Flags);
2157 
2158   Register CorrectedFract = Min;
2159   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2160     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2161     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2162   }
2163 
2164   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2165   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2166 
2167   MI.eraseFromParent();
2168   return true;
2169 }
2170 
2171 // Turn an illegal packed v2s16 build vector into bit operations.
2172 // TODO: This should probably be a bitcast action in LegalizerHelper.
2173 bool AMDGPULegalizerInfo::legalizeBuildVector(
2174   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2175   Register Dst = MI.getOperand(0).getReg();
2176   LLT DstTy = MRI.getType(Dst);
2177   const LLT S32 = LLT::scalar(32);
2178   const LLT V2S16 = LLT::vector(2, 16);
2179   (void)DstTy;
2180   (void)V2S16;
2181   assert(DstTy == V2S16);
2182 
2183   Register Src0 = MI.getOperand(1).getReg();
2184   Register Src1 = MI.getOperand(2).getReg();
2185   assert(MRI.getType(Src0) == LLT::scalar(16));
2186 
2187   B.setInstr(MI);
2188   auto Merge = B.buildMerge(S32, {Src0, Src1});
2189   B.buildBitcast(Dst, Merge);
2190 
2191   MI.eraseFromParent();
2192   return true;
2193 }
2194 
2195 // Return the use branch instruction, otherwise null if the usage is invalid.
2196 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2197                                        MachineRegisterInfo &MRI,
2198                                        MachineInstr *&Br) {
2199   Register CondDef = MI.getOperand(0).getReg();
2200   if (!MRI.hasOneNonDBGUse(CondDef))
2201     return nullptr;
2202 
2203   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2204   if (UseMI.getParent() != MI.getParent() ||
2205       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2206     return nullptr;
2207 
2208   // Make sure the cond br is followed by a G_BR
2209   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2210   if (Next != MI.getParent()->end()) {
2211     if (Next->getOpcode() != AMDGPU::G_BR)
2212       return nullptr;
2213     Br = &*Next;
2214   }
2215 
2216   return &UseMI;
2217 }
2218 
2219 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2220                                                 Register Reg, LLT Ty) const {
2221   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2222   if (LiveIn)
2223     return LiveIn;
2224 
2225   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2226   MRI.addLiveIn(Reg, NewReg);
2227   return NewReg;
2228 }
2229 
2230 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2231                                          const ArgDescriptor *Arg) const {
2232   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2233     return false; // TODO: Handle these
2234 
2235   assert(Arg->getRegister().isPhysical());
2236 
2237   MachineRegisterInfo &MRI = *B.getMRI();
2238 
2239   LLT Ty = MRI.getType(DstReg);
2240   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2241 
2242   if (Arg->isMasked()) {
2243     // TODO: Should we try to emit this once in the entry block?
2244     const LLT S32 = LLT::scalar(32);
2245     const unsigned Mask = Arg->getMask();
2246     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2247 
2248     Register AndMaskSrc = LiveIn;
2249 
2250     if (Shift != 0) {
2251       auto ShiftAmt = B.buildConstant(S32, Shift);
2252       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2253     }
2254 
2255     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2256   } else
2257     B.buildCopy(DstReg, LiveIn);
2258 
2259   // Insert the argument copy if it doens't already exist.
2260   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2261   if (!MRI.getVRegDef(LiveIn)) {
2262     // FIXME: Should have scoped insert pt
2263     MachineBasicBlock &OrigInsBB = B.getMBB();
2264     auto OrigInsPt = B.getInsertPt();
2265 
2266     MachineBasicBlock &EntryMBB = B.getMF().front();
2267     EntryMBB.addLiveIn(Arg->getRegister());
2268     B.setInsertPt(EntryMBB, EntryMBB.begin());
2269     B.buildCopy(LiveIn, Arg->getRegister());
2270 
2271     B.setInsertPt(OrigInsBB, OrigInsPt);
2272   }
2273 
2274   return true;
2275 }
2276 
2277 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2278   MachineInstr &MI,
2279   MachineRegisterInfo &MRI,
2280   MachineIRBuilder &B,
2281   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2282   B.setInstr(MI);
2283 
2284   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2285 
2286   const ArgDescriptor *Arg;
2287   const TargetRegisterClass *RC;
2288   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2289   if (!Arg) {
2290     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2291     return false;
2292   }
2293 
2294   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2295     MI.eraseFromParent();
2296     return true;
2297   }
2298 
2299   return false;
2300 }
2301 
2302 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2303                                        MachineRegisterInfo &MRI,
2304                                        MachineIRBuilder &B) const {
2305   B.setInstr(MI);
2306   Register Dst = MI.getOperand(0).getReg();
2307   LLT DstTy = MRI.getType(Dst);
2308   LLT S16 = LLT::scalar(16);
2309   LLT S32 = LLT::scalar(32);
2310   LLT S64 = LLT::scalar(64);
2311 
2312   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2313     return true;
2314 
2315   if (DstTy == S16)
2316     return legalizeFDIV16(MI, MRI, B);
2317   if (DstTy == S32)
2318     return legalizeFDIV32(MI, MRI, B);
2319   if (DstTy == S64)
2320     return legalizeFDIV64(MI, MRI, B);
2321 
2322   return false;
2323 }
2324 
2325 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2326   const LLT S32 = LLT::scalar(32);
2327 
2328   auto Cvt0 = B.buildUITOFP(S32, Src);
2329   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2330   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2331   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2332   return B.buildFPTOUI(S32, Mul).getReg(0);
2333 }
2334 
2335 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2336                                                   Register DstReg,
2337                                                   Register Num,
2338                                                   Register Den,
2339                                                   bool IsRem) const {
2340   const LLT S1 = LLT::scalar(1);
2341   const LLT S32 = LLT::scalar(32);
2342 
2343   // RCP =  URECIP(Den) = 2^32 / Den + e
2344   // e is rounding error.
2345   auto RCP = buildDivRCP(B, Den);
2346 
2347   // RCP_LO = mul(RCP, Den)
2348   auto RCP_LO = B.buildMul(S32, RCP, Den);
2349 
2350   // RCP_HI = mulhu (RCP, Den) */
2351   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2352 
2353   // NEG_RCP_LO = -RCP_LO
2354   auto Zero = B.buildConstant(S32, 0);
2355   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2356 
2357   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2358   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2359   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2360 
2361   // Calculate the rounding error from the URECIP instruction
2362   // E = mulhu(ABS_RCP_LO, RCP)
2363   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2364 
2365   // RCP_A_E = RCP + E
2366   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2367 
2368   // RCP_S_E = RCP - E
2369   auto RCP_S_E = B.buildSub(S32, RCP, E);
2370 
2371   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2372   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2373 
2374   // Quotient = mulhu(Tmp0, Num)stmp
2375   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2376 
2377   // Num_S_Remainder = Quotient * Den
2378   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2379 
2380   // Remainder = Num - Num_S_Remainder
2381   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2382 
2383   // Remainder_GE_Den = Remainder >= Den
2384   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2385 
2386   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2387   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2388                                        Num, Num_S_Remainder);
2389 
2390   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2391   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2392 
2393   // Calculate Division result:
2394 
2395   // Quotient_A_One = Quotient + 1
2396   auto One = B.buildConstant(S32, 1);
2397   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2398 
2399   // Quotient_S_One = Quotient - 1
2400   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2401 
2402   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2403   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2404 
2405   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2406   if (IsRem) {
2407     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2408 
2409     // Calculate Rem result:
2410     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2411 
2412     // Remainder_A_Den = Remainder + Den
2413     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2414 
2415     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2416     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2417 
2418     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2419     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2420   } else {
2421     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2422   }
2423 }
2424 
2425 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2426                                               MachineRegisterInfo &MRI,
2427                                               MachineIRBuilder &B) const {
2428   B.setInstr(MI);
2429   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2430   Register DstReg = MI.getOperand(0).getReg();
2431   Register Num = MI.getOperand(1).getReg();
2432   Register Den = MI.getOperand(2).getReg();
2433   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2434   MI.eraseFromParent();
2435   return true;
2436 }
2437 
2438 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2439                                             MachineRegisterInfo &MRI,
2440                                             MachineIRBuilder &B) const {
2441   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2442     return legalizeUDIV_UREM32(MI, MRI, B);
2443   return false;
2444 }
2445 
2446 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2447                                               MachineRegisterInfo &MRI,
2448                                               MachineIRBuilder &B) const {
2449   B.setInstr(MI);
2450   const LLT S32 = LLT::scalar(32);
2451 
2452   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2453   Register DstReg = MI.getOperand(0).getReg();
2454   Register LHS = MI.getOperand(1).getReg();
2455   Register RHS = MI.getOperand(2).getReg();
2456 
2457   auto ThirtyOne = B.buildConstant(S32, 31);
2458   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
2459   auto RHSign = B.buildAShr(S32, LHS, ThirtyOne);
2460 
2461   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2462   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2463 
2464   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2465   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2466 
2467   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2468   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2469 
2470   if (IsRem) {
2471     auto RSign = LHSign; // Remainder sign is the same as LHS
2472     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2473     B.buildSub(DstReg, UDivRem, RSign);
2474   } else {
2475     auto DSign = B.buildXor(S32, LHSign, RHSign);
2476     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2477     B.buildSub(DstReg, UDivRem, DSign);
2478   }
2479 
2480   MI.eraseFromParent();
2481   return true;
2482 }
2483 
2484 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2485                                             MachineRegisterInfo &MRI,
2486                                             MachineIRBuilder &B) const {
2487   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2488     return legalizeSDIV_SREM32(MI, MRI, B);
2489   return false;
2490 }
2491 
2492 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2493                                                  MachineRegisterInfo &MRI,
2494                                                  MachineIRBuilder &B) const {
2495   Register Res = MI.getOperand(0).getReg();
2496   Register LHS = MI.getOperand(1).getReg();
2497   Register RHS = MI.getOperand(2).getReg();
2498 
2499   uint16_t Flags = MI.getFlags();
2500 
2501   LLT ResTy = MRI.getType(Res);
2502   LLT S32 = LLT::scalar(32);
2503   LLT S64 = LLT::scalar(64);
2504 
2505   const MachineFunction &MF = B.getMF();
2506   bool Unsafe =
2507     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2508 
2509   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2510     return false;
2511 
2512   if (!Unsafe && ResTy == S32 &&
2513       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2514     return false;
2515 
2516   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2517     // 1 / x -> RCP(x)
2518     if (CLHS->isExactlyValue(1.0)) {
2519       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2520         .addUse(RHS)
2521         .setMIFlags(Flags);
2522 
2523       MI.eraseFromParent();
2524       return true;
2525     }
2526 
2527     // -1 / x -> RCP( FNEG(x) )
2528     if (CLHS->isExactlyValue(-1.0)) {
2529       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2530       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2531         .addUse(FNeg.getReg(0))
2532         .setMIFlags(Flags);
2533 
2534       MI.eraseFromParent();
2535       return true;
2536     }
2537   }
2538 
2539   // x / y -> x * (1.0 / y)
2540   if (Unsafe) {
2541     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2542       .addUse(RHS)
2543       .setMIFlags(Flags);
2544     B.buildFMul(Res, LHS, RCP, Flags);
2545 
2546     MI.eraseFromParent();
2547     return true;
2548   }
2549 
2550   return false;
2551 }
2552 
2553 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2554                                          MachineRegisterInfo &MRI,
2555                                          MachineIRBuilder &B) const {
2556   B.setInstr(MI);
2557   Register Res = MI.getOperand(0).getReg();
2558   Register LHS = MI.getOperand(1).getReg();
2559   Register RHS = MI.getOperand(2).getReg();
2560 
2561   uint16_t Flags = MI.getFlags();
2562 
2563   LLT S16 = LLT::scalar(16);
2564   LLT S32 = LLT::scalar(32);
2565 
2566   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2567   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2568 
2569   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2570     .addUse(RHSExt.getReg(0))
2571     .setMIFlags(Flags);
2572 
2573   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2574   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2575 
2576   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2577     .addUse(RDst.getReg(0))
2578     .addUse(RHS)
2579     .addUse(LHS)
2580     .setMIFlags(Flags);
2581 
2582   MI.eraseFromParent();
2583   return true;
2584 }
2585 
2586 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2587 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2588 static void toggleSPDenormMode(bool Enable,
2589                                MachineIRBuilder &B,
2590                                const GCNSubtarget &ST,
2591                                AMDGPU::SIModeRegisterDefaults Mode) {
2592   // Set SP denorm mode to this value.
2593   unsigned SPDenormMode =
2594     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2595 
2596   if (ST.hasDenormModeInst()) {
2597     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2598     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2599 
2600     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2601     B.buildInstr(AMDGPU::S_DENORM_MODE)
2602       .addImm(NewDenormModeValue);
2603 
2604   } else {
2605     // Select FP32 bit field in mode register.
2606     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2607                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2608                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2609 
2610     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2611       .addImm(SPDenormMode)
2612       .addImm(SPDenormModeBitField);
2613   }
2614 }
2615 
2616 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2617                                          MachineRegisterInfo &MRI,
2618                                          MachineIRBuilder &B) const {
2619   B.setInstr(MI);
2620   Register Res = MI.getOperand(0).getReg();
2621   Register LHS = MI.getOperand(1).getReg();
2622   Register RHS = MI.getOperand(2).getReg();
2623   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2624   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2625 
2626   uint16_t Flags = MI.getFlags();
2627 
2628   LLT S32 = LLT::scalar(32);
2629   LLT S1 = LLT::scalar(1);
2630 
2631   auto One = B.buildFConstant(S32, 1.0f);
2632 
2633   auto DenominatorScaled =
2634     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2635       .addUse(RHS)
2636       .addUse(LHS)
2637       .addImm(1)
2638       .setMIFlags(Flags);
2639   auto NumeratorScaled =
2640     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2641       .addUse(LHS)
2642       .addUse(RHS)
2643       .addImm(0)
2644       .setMIFlags(Flags);
2645 
2646   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2647     .addUse(DenominatorScaled.getReg(0))
2648     .setMIFlags(Flags);
2649   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2650 
2651   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2652   // aren't modeled as reading it.
2653   if (!Mode.allFP32Denormals())
2654     toggleSPDenormMode(true, B, ST, Mode);
2655 
2656   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2657   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2658   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2659   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2660   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2661   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2662 
2663   if (!Mode.allFP32Denormals())
2664     toggleSPDenormMode(false, B, ST, Mode);
2665 
2666   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2667     .addUse(Fma4.getReg(0))
2668     .addUse(Fma1.getReg(0))
2669     .addUse(Fma3.getReg(0))
2670     .addUse(NumeratorScaled.getReg(1))
2671     .setMIFlags(Flags);
2672 
2673   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2674     .addUse(Fmas.getReg(0))
2675     .addUse(RHS)
2676     .addUse(LHS)
2677     .setMIFlags(Flags);
2678 
2679   MI.eraseFromParent();
2680   return true;
2681 }
2682 
2683 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2684                                          MachineRegisterInfo &MRI,
2685                                          MachineIRBuilder &B) const {
2686   B.setInstr(MI);
2687   Register Res = MI.getOperand(0).getReg();
2688   Register LHS = MI.getOperand(1).getReg();
2689   Register RHS = MI.getOperand(2).getReg();
2690 
2691   uint16_t Flags = MI.getFlags();
2692 
2693   LLT S64 = LLT::scalar(64);
2694   LLT S1 = LLT::scalar(1);
2695 
2696   auto One = B.buildFConstant(S64, 1.0);
2697 
2698   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2699     .addUse(LHS)
2700     .addUse(RHS)
2701     .addImm(1)
2702     .setMIFlags(Flags);
2703 
2704   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2705 
2706   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2707     .addUse(DivScale0.getReg(0))
2708     .setMIFlags(Flags);
2709 
2710   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2711   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2712   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2713 
2714   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2715     .addUse(LHS)
2716     .addUse(RHS)
2717     .addImm(0)
2718     .setMIFlags(Flags);
2719 
2720   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2721   auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2722   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2723 
2724   Register Scale;
2725   if (!ST.hasUsableDivScaleConditionOutput()) {
2726     // Workaround a hardware bug on SI where the condition output from div_scale
2727     // is not usable.
2728 
2729     LLT S32 = LLT::scalar(32);
2730 
2731     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2732     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2733     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2734     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2735 
2736     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2737                               Scale1Unmerge.getReg(1));
2738     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2739                               Scale0Unmerge.getReg(1));
2740     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2741   } else {
2742     Scale = DivScale1.getReg(1);
2743   }
2744 
2745   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2746     .addUse(Fma4.getReg(0))
2747     .addUse(Fma3.getReg(0))
2748     .addUse(Mul.getReg(0))
2749     .addUse(Scale)
2750     .setMIFlags(Flags);
2751 
2752   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2753     .addUse(Fmas.getReg(0))
2754     .addUse(RHS)
2755     .addUse(LHS)
2756     .setMIFlags(Flags);
2757 
2758   MI.eraseFromParent();
2759   return true;
2760 }
2761 
2762 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2763                                                  MachineRegisterInfo &MRI,
2764                                                  MachineIRBuilder &B) const {
2765   B.setInstr(MI);
2766   Register Res = MI.getOperand(0).getReg();
2767   Register LHS = MI.getOperand(2).getReg();
2768   Register RHS = MI.getOperand(3).getReg();
2769   uint16_t Flags = MI.getFlags();
2770 
2771   LLT S32 = LLT::scalar(32);
2772   LLT S1 = LLT::scalar(1);
2773 
2774   auto Abs = B.buildFAbs(S32, RHS, Flags);
2775   const APFloat C0Val(1.0f);
2776 
2777   auto C0 = B.buildConstant(S32, 0x6f800000);
2778   auto C1 = B.buildConstant(S32, 0x2f800000);
2779   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2780 
2781   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2782   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2783 
2784   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2785 
2786   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2787     .addUse(Mul0.getReg(0))
2788     .setMIFlags(Flags);
2789 
2790   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2791 
2792   B.buildFMul(Res, Sel, Mul1, Flags);
2793 
2794   MI.eraseFromParent();
2795   return true;
2796 }
2797 
2798 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2799                                                  MachineRegisterInfo &MRI,
2800                                                  MachineIRBuilder &B) const {
2801   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2802   if (!MFI->isEntryFunction()) {
2803     return legalizePreloadedArgIntrin(MI, MRI, B,
2804                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2805   }
2806 
2807   B.setInstr(MI);
2808 
2809   uint64_t Offset =
2810     ST.getTargetLowering()->getImplicitParameterOffset(
2811       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2812   Register DstReg = MI.getOperand(0).getReg();
2813   LLT DstTy = MRI.getType(DstReg);
2814   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2815 
2816   const ArgDescriptor *Arg;
2817   const TargetRegisterClass *RC;
2818   std::tie(Arg, RC)
2819     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2820   if (!Arg)
2821     return false;
2822 
2823   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2824   if (!loadInputValue(KernargPtrReg, B, Arg))
2825     return false;
2826 
2827   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2828   MI.eraseFromParent();
2829   return true;
2830 }
2831 
2832 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2833                                               MachineRegisterInfo &MRI,
2834                                               MachineIRBuilder &B,
2835                                               unsigned AddrSpace) const {
2836   B.setInstr(MI);
2837   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2838   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2839   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2840   MI.eraseFromParent();
2841   return true;
2842 }
2843 
2844 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2845 // offset (the offset that is included in bounds checking and swizzling, to be
2846 // split between the instruction's voffset and immoffset fields) and soffset
2847 // (the offset that is excluded from bounds checking and swizzling, to go in
2848 // the instruction's soffset field).  This function takes the first kind of
2849 // offset and figures out how to split it between voffset and immoffset.
2850 std::tuple<Register, unsigned, unsigned>
2851 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2852                                         Register OrigOffset) const {
2853   const unsigned MaxImm = 4095;
2854   Register BaseReg;
2855   unsigned TotalConstOffset;
2856   MachineInstr *OffsetDef;
2857   const LLT S32 = LLT::scalar(32);
2858 
2859   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2860     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2861 
2862   unsigned ImmOffset = TotalConstOffset;
2863 
2864   // If the immediate value is too big for the immoffset field, put the value
2865   // and -4096 into the immoffset field so that the value that is copied/added
2866   // for the voffset field is a multiple of 4096, and it stands more chance
2867   // of being CSEd with the copy/add for another similar load/store.
2868   // However, do not do that rounding down to a multiple of 4096 if that is a
2869   // negative number, as it appears to be illegal to have a negative offset
2870   // in the vgpr, even if adding the immediate offset makes it positive.
2871   unsigned Overflow = ImmOffset & ~MaxImm;
2872   ImmOffset -= Overflow;
2873   if ((int32_t)Overflow < 0) {
2874     Overflow += ImmOffset;
2875     ImmOffset = 0;
2876   }
2877 
2878   if (Overflow != 0) {
2879     if (!BaseReg) {
2880       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2881     } else {
2882       auto OverflowVal = B.buildConstant(S32, Overflow);
2883       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2884     }
2885   }
2886 
2887   if (!BaseReg)
2888     BaseReg = B.buildConstant(S32, 0).getReg(0);
2889 
2890   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2891 }
2892 
2893 /// Handle register layout difference for f16 images for some subtargets.
2894 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2895                                              MachineRegisterInfo &MRI,
2896                                              Register Reg) const {
2897   if (!ST.hasUnpackedD16VMem())
2898     return Reg;
2899 
2900   const LLT S16 = LLT::scalar(16);
2901   const LLT S32 = LLT::scalar(32);
2902   LLT StoreVT = MRI.getType(Reg);
2903   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2904 
2905   auto Unmerge = B.buildUnmerge(S16, Reg);
2906 
2907   SmallVector<Register, 4> WideRegs;
2908   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2909     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2910 
2911   int NumElts = StoreVT.getNumElements();
2912 
2913   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2914 }
2915 
2916 Register AMDGPULegalizerInfo::fixStoreSourceType(
2917   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2918   MachineRegisterInfo *MRI = B.getMRI();
2919   LLT Ty = MRI->getType(VData);
2920 
2921   const LLT S16 = LLT::scalar(16);
2922 
2923   // Fixup illegal register types for i8 stores.
2924   if (Ty == LLT::scalar(8) || Ty == S16) {
2925     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2926     return AnyExt;
2927   }
2928 
2929   if (Ty.isVector()) {
2930     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2931       if (IsFormat)
2932         return handleD16VData(B, *MRI, VData);
2933     }
2934   }
2935 
2936   return VData;
2937 }
2938 
2939 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2940                                               MachineRegisterInfo &MRI,
2941                                               MachineIRBuilder &B,
2942                                               bool IsTyped,
2943                                               bool IsFormat) const {
2944   B.setInstr(MI);
2945 
2946   Register VData = MI.getOperand(1).getReg();
2947   LLT Ty = MRI.getType(VData);
2948   LLT EltTy = Ty.getScalarType();
2949   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2950   const LLT S32 = LLT::scalar(32);
2951 
2952   VData = fixStoreSourceType(B, VData, IsFormat);
2953   Register RSrc = MI.getOperand(2).getReg();
2954 
2955   MachineMemOperand *MMO = *MI.memoperands_begin();
2956   const int MemSize = MMO->getSize();
2957 
2958   unsigned ImmOffset;
2959   unsigned TotalOffset;
2960 
2961   // The typed intrinsics add an immediate after the registers.
2962   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2963 
2964   // The struct intrinsic variants add one additional operand over raw.
2965   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2966   Register VIndex;
2967   int OpOffset = 0;
2968   if (HasVIndex) {
2969     VIndex = MI.getOperand(3).getReg();
2970     OpOffset = 1;
2971   }
2972 
2973   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2974   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2975 
2976   unsigned Format = 0;
2977   if (IsTyped) {
2978     Format = MI.getOperand(5 + OpOffset).getImm();
2979     ++OpOffset;
2980   }
2981 
2982   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2983 
2984   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2985   if (TotalOffset != 0)
2986     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2987 
2988   unsigned Opc;
2989   if (IsTyped) {
2990     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2991                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2992   } else if (IsFormat) {
2993     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2994                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2995   } else {
2996     switch (MemSize) {
2997     case 1:
2998       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2999       break;
3000     case 2:
3001       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3002       break;
3003     default:
3004       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3005       break;
3006     }
3007   }
3008 
3009   if (!VIndex)
3010     VIndex = B.buildConstant(S32, 0).getReg(0);
3011 
3012   auto MIB = B.buildInstr(Opc)
3013     .addUse(VData)              // vdata
3014     .addUse(RSrc)               // rsrc
3015     .addUse(VIndex)             // vindex
3016     .addUse(VOffset)            // voffset
3017     .addUse(SOffset)            // soffset
3018     .addImm(ImmOffset);         // offset(imm)
3019 
3020   if (IsTyped)
3021     MIB.addImm(Format);
3022 
3023   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3024      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3025      .addMemOperand(MMO);
3026 
3027   MI.eraseFromParent();
3028   return true;
3029 }
3030 
3031 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3032                                              MachineRegisterInfo &MRI,
3033                                              MachineIRBuilder &B,
3034                                              bool IsFormat,
3035                                              bool IsTyped) const {
3036   B.setInstr(MI);
3037 
3038   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3039   MachineMemOperand *MMO = *MI.memoperands_begin();
3040   const int MemSize = MMO->getSize();
3041   const LLT S32 = LLT::scalar(32);
3042 
3043   Register Dst = MI.getOperand(0).getReg();
3044   Register RSrc = MI.getOperand(2).getReg();
3045 
3046   // The typed intrinsics add an immediate after the registers.
3047   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3048 
3049   // The struct intrinsic variants add one additional operand over raw.
3050   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3051   Register VIndex;
3052   int OpOffset = 0;
3053   if (HasVIndex) {
3054     VIndex = MI.getOperand(3).getReg();
3055     OpOffset = 1;
3056   }
3057 
3058   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3059   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3060 
3061   unsigned Format = 0;
3062   if (IsTyped) {
3063     Format = MI.getOperand(5 + OpOffset).getImm();
3064     ++OpOffset;
3065   }
3066 
3067   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3068   unsigned ImmOffset;
3069   unsigned TotalOffset;
3070 
3071   LLT Ty = MRI.getType(Dst);
3072   LLT EltTy = Ty.getScalarType();
3073   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3074   const bool Unpacked = ST.hasUnpackedD16VMem();
3075 
3076   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3077   if (TotalOffset != 0)
3078     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3079 
3080   unsigned Opc;
3081 
3082   if (IsTyped) {
3083     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3084                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3085   } else if (IsFormat) {
3086     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3087                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3088   } else {
3089     switch (MemSize) {
3090     case 1:
3091       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3092       break;
3093     case 2:
3094       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3095       break;
3096     default:
3097       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3098       break;
3099     }
3100   }
3101 
3102   Register LoadDstReg;
3103 
3104   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3105   LLT UnpackedTy = Ty.changeElementSize(32);
3106 
3107   if (IsExtLoad)
3108     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3109   else if (Unpacked && IsD16 && Ty.isVector())
3110     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3111   else
3112     LoadDstReg = Dst;
3113 
3114   if (!VIndex)
3115     VIndex = B.buildConstant(S32, 0).getReg(0);
3116 
3117   auto MIB = B.buildInstr(Opc)
3118     .addDef(LoadDstReg)         // vdata
3119     .addUse(RSrc)               // rsrc
3120     .addUse(VIndex)             // vindex
3121     .addUse(VOffset)            // voffset
3122     .addUse(SOffset)            // soffset
3123     .addImm(ImmOffset);         // offset(imm)
3124 
3125   if (IsTyped)
3126     MIB.addImm(Format);
3127 
3128   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3129      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3130      .addMemOperand(MMO);
3131 
3132   if (LoadDstReg != Dst) {
3133     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3134 
3135     // Widen result for extending loads was widened.
3136     if (IsExtLoad)
3137       B.buildTrunc(Dst, LoadDstReg);
3138     else {
3139       // Repack to original 16-bit vector result
3140       // FIXME: G_TRUNC should work, but legalization currently fails
3141       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3142       SmallVector<Register, 4> Repack;
3143       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3144         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3145       B.buildMerge(Dst, Repack);
3146     }
3147   }
3148 
3149   MI.eraseFromParent();
3150   return true;
3151 }
3152 
3153 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3154                                                MachineIRBuilder &B,
3155                                                bool IsInc) const {
3156   B.setInstr(MI);
3157   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3158                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3159   B.buildInstr(Opc)
3160     .addDef(MI.getOperand(0).getReg())
3161     .addUse(MI.getOperand(2).getReg())
3162     .addUse(MI.getOperand(3).getReg())
3163     .cloneMemRefs(MI);
3164   MI.eraseFromParent();
3165   return true;
3166 }
3167 
3168 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3169   switch (IntrID) {
3170   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3171   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3172     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3173   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3174   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3175     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3176   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3177   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3178     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3179   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3180   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3181     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3182   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3183   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3184     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3185   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3186   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3187     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3188   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3189   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3190     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3191   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3192   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3193     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3194   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3195   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3196     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3197   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3198   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3199     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3200   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3201   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3202     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3203   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3204   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3205     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3206   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3207   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3208     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3209   default:
3210     llvm_unreachable("unhandled atomic opcode");
3211   }
3212 }
3213 
3214 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3215                                                MachineIRBuilder &B,
3216                                                Intrinsic::ID IID) const {
3217   B.setInstr(MI);
3218 
3219   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3220                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3221 
3222   Register Dst = MI.getOperand(0).getReg();
3223   Register VData = MI.getOperand(2).getReg();
3224 
3225   Register CmpVal;
3226   int OpOffset = 0;
3227 
3228   if (IsCmpSwap) {
3229     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3230     ++OpOffset;
3231   }
3232 
3233   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3234   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3235 
3236   // The struct intrinsic variants add one additional operand over raw.
3237   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3238   Register VIndex;
3239   if (HasVIndex) {
3240     VIndex = MI.getOperand(4 + OpOffset).getReg();
3241     ++OpOffset;
3242   }
3243 
3244   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3245   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3246   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3247 
3248   MachineMemOperand *MMO = *MI.memoperands_begin();
3249 
3250   unsigned ImmOffset;
3251   unsigned TotalOffset;
3252   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3253   if (TotalOffset != 0)
3254     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3255 
3256   if (!VIndex)
3257     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3258 
3259   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3260     .addDef(Dst)
3261     .addUse(VData); // vdata
3262 
3263   if (IsCmpSwap)
3264     MIB.addReg(CmpVal);
3265 
3266   MIB.addUse(RSrc)               // rsrc
3267      .addUse(VIndex)             // vindex
3268      .addUse(VOffset)            // voffset
3269      .addUse(SOffset)            // soffset
3270      .addImm(ImmOffset)          // offset(imm)
3271      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3272      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3273      .addMemOperand(MMO);
3274 
3275   MI.eraseFromParent();
3276   return true;
3277 }
3278 
3279 // Produce a vector of s16 elements from s32 pieces.
3280 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3281                              ArrayRef<Register> UnmergeParts) {
3282   const LLT S16 = LLT::scalar(16);
3283 
3284   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3285   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3286     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3287 
3288   B.buildBuildVector(DstReg, RemergeParts);
3289 }
3290 
3291 /// Convert a set of s32 registers to a result vector with s16 elements.
3292 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3293                                ArrayRef<Register> UnmergeParts) {
3294   MachineRegisterInfo &MRI = *B.getMRI();
3295   const LLT V2S16 = LLT::vector(2, 16);
3296   LLT TargetTy = MRI.getType(DstReg);
3297   int NumElts = UnmergeParts.size();
3298 
3299   if (NumElts == 1) {
3300     assert(TargetTy == V2S16);
3301     B.buildBitcast(DstReg, UnmergeParts[0]);
3302     return;
3303   }
3304 
3305   SmallVector<Register, 4> RemergeParts(NumElts);
3306   for (int I = 0; I != NumElts; ++I)
3307     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3308 
3309   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3310     B.buildConcatVectors(DstReg, RemergeParts);
3311     return;
3312   }
3313 
3314   const LLT V3S16 = LLT::vector(3, 16);
3315   const LLT V6S16 = LLT::vector(6, 16);
3316 
3317   // Widen to v6s16 and unpack v3 parts.
3318   assert(TargetTy == V3S16);
3319 
3320   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3321   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3322   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3323 }
3324 
3325 // FIXME: Just vector trunc should be sufficent, but legalization currently
3326 // broken.
3327 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3328                                   Register WideDstReg) {
3329   const LLT S32 = LLT::scalar(32);
3330   const LLT S16 = LLT::scalar(16);
3331 
3332   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3333 
3334   int NumOps = Unmerge->getNumOperands() - 1;
3335   SmallVector<Register, 4> RemergeParts(NumOps);
3336   for (int I = 0; I != NumOps; ++I)
3337     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3338 
3339   B.buildBuildVector(DstReg, RemergeParts);
3340 }
3341 
3342 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3343     MachineInstr &MI, MachineIRBuilder &B,
3344     GISelChangeObserver &Observer,
3345     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3346   bool IsTFE = MI.getNumExplicitDefs() == 2;
3347 
3348   // We are only processing the operands of d16 image operations on subtargets
3349   // that use the unpacked register layout, or need to repack the TFE result.
3350 
3351   // TODO: Need to handle a16 images too
3352   // TODO: Do we need to guard against already legalized intrinsics?
3353   if (!IsTFE && !ST.hasUnpackedD16VMem())
3354     return true;
3355 
3356   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3357     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3358 
3359   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3360     return true;
3361 
3362   B.setInstr(MI);
3363 
3364   MachineRegisterInfo *MRI = B.getMRI();
3365   const LLT S32 = LLT::scalar(32);
3366   const LLT S16 = LLT::scalar(16);
3367 
3368   if (BaseOpcode->Store) { // No TFE for stores?
3369     Register VData = MI.getOperand(1).getReg();
3370     LLT Ty = MRI->getType(VData);
3371     if (!Ty.isVector() || Ty.getElementType() != S16)
3372       return true;
3373 
3374     B.setInstr(MI);
3375 
3376     Observer.changingInstr(MI);
3377     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3378     Observer.changedInstr(MI);
3379     return true;
3380   }
3381 
3382   Register DstReg = MI.getOperand(0).getReg();
3383   LLT Ty = MRI->getType(DstReg);
3384   const LLT EltTy = Ty.getScalarType();
3385   const bool IsD16 = Ty.getScalarType() == S16;
3386   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3387 
3388   if (IsTFE) {
3389     // In the IR, TFE is supposed to be used with a 2 element struct return
3390     // type. The intruction really returns these two values in one contiguous
3391     // register, with one additional dword beyond the loaded data. Rewrite the
3392     // return type to use a single register result.
3393     Register Dst1Reg = MI.getOperand(1).getReg();
3394     if (MRI->getType(Dst1Reg) != S32)
3395       return false;
3396 
3397     // TODO: Make sure the TFE operand bit is set.
3398 
3399     // The raw dword aligned data component of the load. The only legal cases
3400     // where this matters should be when using the packed D16 format, for
3401     // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3402     LLT RoundedTy;
3403     LLT TFETy;
3404 
3405     if (IsD16 && ST.hasUnpackedD16VMem()) {
3406       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3407       TFETy = LLT::vector(NumElts + 1, 32);
3408     } else {
3409       unsigned EltSize = Ty.getScalarSizeInBits();
3410       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3411       unsigned RoundedSize = 32 * RoundedElts;
3412       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3413       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3414     }
3415 
3416     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3417     Observer.changingInstr(MI);
3418 
3419     MI.getOperand(0).setReg(TFEReg);
3420     MI.RemoveOperand(1);
3421 
3422     Observer.changedInstr(MI);
3423 
3424     // Insert after the instruction.
3425     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3426 
3427     // Now figure out how to copy the new result register back into the old
3428     // result.
3429 
3430     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3431     int NumDataElts = TFETy.getNumElements() - 1;
3432 
3433     if (!Ty.isVector()) {
3434       // Simplest case is a trivial unmerge (plus a truncate for d16).
3435       UnmergeResults[0] = Ty == S32 ?
3436         DstReg : MRI->createGenericVirtualRegister(S32);
3437 
3438       B.buildUnmerge(UnmergeResults, TFEReg);
3439       if (Ty != S32)
3440         B.buildTrunc(DstReg, UnmergeResults[0]);
3441       return true;
3442     }
3443 
3444     // We have to repack into a new vector of some kind.
3445     for (int I = 0; I != NumDataElts; ++I)
3446       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3447     B.buildUnmerge(UnmergeResults, TFEReg);
3448 
3449     // Drop the final TFE element.
3450     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3451 
3452     if (EltTy == S32)
3453       B.buildBuildVector(DstReg, DataPart);
3454     else if (ST.hasUnpackedD16VMem())
3455       truncToS16Vector(B, DstReg, DataPart);
3456     else
3457       bitcastToS16Vector(B, DstReg, DataPart);
3458 
3459     return true;
3460   }
3461 
3462   // Must be an image load.
3463   if (!Ty.isVector() || Ty.getElementType() != S16)
3464     return true;
3465 
3466   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3467 
3468   LLT WidenedTy = Ty.changeElementType(S32);
3469   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3470 
3471   Observer.changingInstr(MI);
3472   MI.getOperand(0).setReg(WideDstReg);
3473   Observer.changedInstr(MI);
3474 
3475   repackUnpackedD16Load(B, DstReg, WideDstReg);
3476   return true;
3477 }
3478 
3479 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3480   MachineInstr &MI, MachineIRBuilder &B,
3481   GISelChangeObserver &Observer) const {
3482   Register Dst = MI.getOperand(0).getReg();
3483   LLT Ty = B.getMRI()->getType(Dst);
3484   unsigned Size = Ty.getSizeInBits();
3485   MachineFunction &MF = B.getMF();
3486 
3487   Observer.changingInstr(MI);
3488 
3489   // FIXME: We don't really need this intermediate instruction. The intrinsic
3490   // should be fixed to have a memory operand. Since it's readnone, we're not
3491   // allowed to add one.
3492   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3493   MI.RemoveOperand(1); // Remove intrinsic ID
3494 
3495   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3496   // TODO: Should this use datalayout alignment?
3497   const unsigned MemSize = (Size + 7) / 8;
3498   const unsigned MemAlign = 4;
3499   MachineMemOperand *MMO = MF.getMachineMemOperand(
3500     MachinePointerInfo(),
3501     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3502     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3503   MI.addMemOperand(MF, MMO);
3504 
3505   // There are no 96-bit result scalar loads, but widening to 128-bit should
3506   // always be legal. We may need to restore this to a 96-bit result if it turns
3507   // out this needs to be converted to a vector load during RegBankSelect.
3508   if (!isPowerOf2_32(Size)) {
3509     LegalizerHelper Helper(MF, *this, Observer, B);
3510     B.setInstr(MI);
3511 
3512     if (Ty.isVector())
3513       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3514     else
3515       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3516   }
3517 
3518   Observer.changedInstr(MI);
3519   return true;
3520 }
3521 
3522 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3523                                             MachineIRBuilder &B,
3524                                             GISelChangeObserver &Observer) const {
3525   MachineRegisterInfo &MRI = *B.getMRI();
3526 
3527   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
3528   auto IntrID = MI.getIntrinsicID();
3529   switch (IntrID) {
3530   case Intrinsic::amdgcn_if:
3531   case Intrinsic::amdgcn_else: {
3532     MachineInstr *Br = nullptr;
3533     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3534       const SIRegisterInfo *TRI
3535         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3536 
3537       B.setInstr(*BrCond);
3538       Register Def = MI.getOperand(1).getReg();
3539       Register Use = MI.getOperand(3).getReg();
3540 
3541       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3542       if (Br)
3543         BrTarget = Br->getOperand(0).getMBB();
3544 
3545       if (IntrID == Intrinsic::amdgcn_if) {
3546         B.buildInstr(AMDGPU::SI_IF)
3547           .addDef(Def)
3548           .addUse(Use)
3549           .addMBB(BrTarget);
3550       } else {
3551         B.buildInstr(AMDGPU::SI_ELSE)
3552           .addDef(Def)
3553           .addUse(Use)
3554           .addMBB(BrTarget)
3555           .addImm(0);
3556       }
3557 
3558       if (Br)
3559         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3560 
3561       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3562       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3563       MI.eraseFromParent();
3564       BrCond->eraseFromParent();
3565       return true;
3566     }
3567 
3568     return false;
3569   }
3570   case Intrinsic::amdgcn_loop: {
3571     MachineInstr *Br = nullptr;
3572     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3573       const SIRegisterInfo *TRI
3574         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3575 
3576       B.setInstr(*BrCond);
3577 
3578       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3579       if (Br)
3580         BrTarget = Br->getOperand(0).getMBB();
3581 
3582       Register Reg = MI.getOperand(2).getReg();
3583       B.buildInstr(AMDGPU::SI_LOOP)
3584         .addUse(Reg)
3585         .addMBB(BrTarget);
3586 
3587       if (Br)
3588         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3589 
3590       MI.eraseFromParent();
3591       BrCond->eraseFromParent();
3592       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3593       return true;
3594     }
3595 
3596     return false;
3597   }
3598   case Intrinsic::amdgcn_kernarg_segment_ptr:
3599     return legalizePreloadedArgIntrin(
3600       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3601   case Intrinsic::amdgcn_implicitarg_ptr:
3602     return legalizeImplicitArgPtr(MI, MRI, B);
3603   case Intrinsic::amdgcn_workitem_id_x:
3604     return legalizePreloadedArgIntrin(MI, MRI, B,
3605                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3606   case Intrinsic::amdgcn_workitem_id_y:
3607     return legalizePreloadedArgIntrin(MI, MRI, B,
3608                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3609   case Intrinsic::amdgcn_workitem_id_z:
3610     return legalizePreloadedArgIntrin(MI, MRI, B,
3611                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3612   case Intrinsic::amdgcn_workgroup_id_x:
3613     return legalizePreloadedArgIntrin(MI, MRI, B,
3614                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3615   case Intrinsic::amdgcn_workgroup_id_y:
3616     return legalizePreloadedArgIntrin(MI, MRI, B,
3617                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3618   case Intrinsic::amdgcn_workgroup_id_z:
3619     return legalizePreloadedArgIntrin(MI, MRI, B,
3620                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3621   case Intrinsic::amdgcn_dispatch_ptr:
3622     return legalizePreloadedArgIntrin(MI, MRI, B,
3623                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3624   case Intrinsic::amdgcn_queue_ptr:
3625     return legalizePreloadedArgIntrin(MI, MRI, B,
3626                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3627   case Intrinsic::amdgcn_implicit_buffer_ptr:
3628     return legalizePreloadedArgIntrin(
3629       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3630   case Intrinsic::amdgcn_dispatch_id:
3631     return legalizePreloadedArgIntrin(MI, MRI, B,
3632                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3633   case Intrinsic::amdgcn_fdiv_fast:
3634     return legalizeFDIVFastIntrin(MI, MRI, B);
3635   case Intrinsic::amdgcn_is_shared:
3636     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3637   case Intrinsic::amdgcn_is_private:
3638     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3639   case Intrinsic::amdgcn_wavefrontsize: {
3640     B.setInstr(MI);
3641     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3642     MI.eraseFromParent();
3643     return true;
3644   }
3645   case Intrinsic::amdgcn_s_buffer_load:
3646     return legalizeSBufferLoad(MI, B, Observer);
3647   case Intrinsic::amdgcn_raw_buffer_store:
3648   case Intrinsic::amdgcn_struct_buffer_store:
3649     return legalizeBufferStore(MI, MRI, B, false, false);
3650   case Intrinsic::amdgcn_raw_buffer_store_format:
3651   case Intrinsic::amdgcn_struct_buffer_store_format:
3652     return legalizeBufferStore(MI, MRI, B, false, true);
3653   case Intrinsic::amdgcn_raw_tbuffer_store:
3654   case Intrinsic::amdgcn_struct_tbuffer_store:
3655     return legalizeBufferStore(MI, MRI, B, true, true);
3656   case Intrinsic::amdgcn_raw_buffer_load:
3657   case Intrinsic::amdgcn_struct_buffer_load:
3658     return legalizeBufferLoad(MI, MRI, B, false, false);
3659   case Intrinsic::amdgcn_raw_buffer_load_format:
3660   case Intrinsic::amdgcn_struct_buffer_load_format:
3661     return legalizeBufferLoad(MI, MRI, B, true, false);
3662   case Intrinsic::amdgcn_raw_tbuffer_load:
3663   case Intrinsic::amdgcn_struct_tbuffer_load:
3664     return legalizeBufferLoad(MI, MRI, B, true, true);
3665   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3666   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3667   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3668   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3669   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3670   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3671   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3672   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3673   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3674   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3675   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3676   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3677   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3678   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3679   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3680   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3681   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3682   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3683   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3684   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3685   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3686   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3687   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3688   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3689   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3690   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3691     return legalizeBufferAtomic(MI, B, IntrID);
3692   case Intrinsic::amdgcn_atomic_inc:
3693     return legalizeAtomicIncDec(MI, B, true);
3694   case Intrinsic::amdgcn_atomic_dec:
3695     return legalizeAtomicIncDec(MI, B, false);
3696   default: {
3697     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3698             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3699       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3700     return true;
3701   }
3702   }
3703 
3704   return true;
3705 }
3706