1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
// Round the number of elements up to the next power of two.
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
// Round the number of bits up to the next power of two.
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
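// Any type whose total size is at most MaxSize bits and whose scalar/element
// size is a multiple of 32 bits.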
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
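// The type at TypeIdx has exactly Size bits.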
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
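// Vectors with an odd number of sub-32-bit elements whose total size is not a
// multiple of 32 bits.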
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
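// Vectors of 16-bit elements with more than two elements.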
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
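// Mutation: grow the vector at TypeIdx by one element.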
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
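// Mutation: reduce the element count at TypeIdx so each resulting piece is
// roughly 64 bits wide.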
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
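// The type at TypeIdx is a vector with a total size below Size bits.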
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
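// The type at TypeIdx is a vector with a total size above Size bits.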
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
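// The type at TypeIdx is a vector with an odd number of elements.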
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
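// The type at TypeIdx is a vector whose element type is exactly Type.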
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
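// Scalar truncating stores: a scalar wider than 32 bits stored to a memory
// size smaller than the register size.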
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
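// The type at TypeIdx0 is narrower than the type at TypeIdx1.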
181 static LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
182   return [=](const LegalityQuery &Query) {
183     return Query.Types[TypeIdx0].getSizeInBits() <
184            Query.Types[TypeIdx1].getSizeInBits();
185   };
186 }
187 
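// The type at TypeIdx0 is wider than the type at TypeIdx1.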
188 static LegalityPredicate greaterThan(unsigned TypeIdx0, unsigned TypeIdx1) {
189   return [=](const LegalityQuery &Query) {
190     return Query.Types[TypeIdx0].getSizeInBits() >
191            Query.Types[TypeIdx1].getSizeInBits();
192   };
193 }
194 
195 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
196                                          const GCNTargetMachine &TM)
  : ST(ST_) {
198   using namespace TargetOpcode;
199 
200   auto GetAddrSpacePtr = [&TM](unsigned AS) {
201     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
202   };
203 
204   const LLT S1 = LLT::scalar(1);
205   const LLT S16 = LLT::scalar(16);
206   const LLT S32 = LLT::scalar(32);
207   const LLT S64 = LLT::scalar(64);
208   const LLT S128 = LLT::scalar(128);
209   const LLT S256 = LLT::scalar(256);
210   const LLT S1024 = LLT::scalar(1024);
211 
212   const LLT V2S16 = LLT::vector(2, 16);
213   const LLT V4S16 = LLT::vector(4, 16);
214 
215   const LLT V2S32 = LLT::vector(2, 32);
216   const LLT V3S32 = LLT::vector(3, 32);
217   const LLT V4S32 = LLT::vector(4, 32);
218   const LLT V5S32 = LLT::vector(5, 32);
219   const LLT V6S32 = LLT::vector(6, 32);
220   const LLT V7S32 = LLT::vector(7, 32);
221   const LLT V8S32 = LLT::vector(8, 32);
222   const LLT V9S32 = LLT::vector(9, 32);
223   const LLT V10S32 = LLT::vector(10, 32);
224   const LLT V11S32 = LLT::vector(11, 32);
225   const LLT V12S32 = LLT::vector(12, 32);
226   const LLT V13S32 = LLT::vector(13, 32);
227   const LLT V14S32 = LLT::vector(14, 32);
228   const LLT V15S32 = LLT::vector(15, 32);
229   const LLT V16S32 = LLT::vector(16, 32);
230   const LLT V32S32 = LLT::vector(32, 32);
231 
232   const LLT V2S64 = LLT::vector(2, 64);
233   const LLT V3S64 = LLT::vector(3, 64);
234   const LLT V4S64 = LLT::vector(4, 64);
235   const LLT V5S64 = LLT::vector(5, 64);
236   const LLT V6S64 = LLT::vector(6, 64);
237   const LLT V7S64 = LLT::vector(7, 64);
238   const LLT V8S64 = LLT::vector(8, 64);
239   const LLT V16S64 = LLT::vector(16, 64);
240 
241   std::initializer_list<LLT> AllS32Vectors =
242     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
243      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
244   std::initializer_list<LLT> AllS64Vectors =
245     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
246 
247   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
248   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
249   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
250   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
251   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
252   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
253   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
254 
255   const LLT CodePtr = FlatPtr;
256 
257   const std::initializer_list<LLT> AddrSpaces64 = {
258     GlobalPtr, ConstantPtr, FlatPtr
259   };
260 
261   const std::initializer_list<LLT> AddrSpaces32 = {
262     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
263   };
264 
265   const std::initializer_list<LLT> FPTypesBase = {
266     S32, S64
267   };
268 
269   const std::initializer_list<LLT> FPTypes16 = {
270     S32, S64, S16
271   };
272 
273   const std::initializer_list<LLT> FPTypesPK16 = {
274     S32, S64, S16, V2S16
275   };
276 
277   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
278 
279   setAction({G_BRCOND, S1}, Legal); // VCC branches
280   setAction({G_BRCOND, S32}, Legal); // SCC branches
281 
282   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
283   // elements for v3s16
284   getActionDefinitionsBuilder(G_PHI)
285     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
286     .legalFor(AllS32Vectors)
287     .legalFor(AllS64Vectors)
288     .legalFor(AddrSpaces64)
289     .legalFor(AddrSpaces32)
290     .clampScalar(0, S32, S256)
291     .widenScalarToNextPow2(0, 32)
292     .clampMaxNumElements(0, S32, 16)
293     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
294     .legalIf(isPointer(0));
295 
296   if (ST.has16BitInsts()) {
297     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
298       .legalFor({S32, S16})
299       .clampScalar(0, S16, S32)
300       .scalarize(0)
301       .widenScalarToNextPow2(0, 32);
302   } else {
303     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
304       .legalFor({S32})
305       .clampScalar(0, S32, S32)
306       .scalarize(0);
307   }
308 
309   // FIXME: Not really legal. Placeholder for custom lowering.
310   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
311     .customFor({S32, S64})
312     .clampScalar(0, S32, S64)
313     .widenScalarToNextPow2(0, 32)
314     .scalarize(0);
315 
316   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
317     .legalFor({S32})
318     .clampScalar(0, S32, S32)
319     .scalarize(0);
320 
321   // Report legal for any types we can handle anywhere. For the cases only legal
322   // on the SALU, RegBankSelect will be able to re-legalize.
323   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
324     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
325     .clampScalar(0, S32, S64)
326     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
327     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
328     .widenScalarToNextPow2(0)
329     .scalarize(0);
330 
331   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
332                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
333     .legalFor({{S32, S1}, {S32, S32}})
334     .clampScalar(0, S32, S32)
335     .scalarize(0); // TODO: Implement.
336 
337   getActionDefinitionsBuilder(G_BITCAST)
338     // Don't worry about the size constraint.
339     .legalIf(all(isRegisterType(0), isRegisterType(1)))
340     .lower();
341 
342 
343   getActionDefinitionsBuilder(G_CONSTANT)
344     .legalFor({S1, S32, S64, S16, GlobalPtr,
345                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
346     .clampScalar(0, S32, S64)
347     .widenScalarToNextPow2(0)
348     .legalIf(isPointer(0));
349 
350   getActionDefinitionsBuilder(G_FCONSTANT)
351     .legalFor({S32, S64, S16})
352     .clampScalar(0, S16, S64);
353 
354   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
355     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
356                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
357     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
358     .clampScalarOrElt(0, S32, S1024)
359     .legalIf(isMultiple32(0))
360     .widenScalarToNextPow2(0, 32)
361     .clampMaxNumElements(0, S32, 16);
362 
363   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
364   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
365     .unsupportedFor({PrivatePtr})
366     .custom();
367   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
368 
369   auto &FPOpActions = getActionDefinitionsBuilder(
370     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
371     .legalFor({S32, S64});
372   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
373     .customFor({S32, S64});
374   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
375     .customFor({S32, S64});
376 
377   if (ST.has16BitInsts()) {
378     if (ST.hasVOP3PInsts())
379       FPOpActions.legalFor({S16, V2S16});
380     else
381       FPOpActions.legalFor({S16});
382 
383     TrigActions.customFor({S16});
384     FDIVActions.customFor({S16});
385   }
386 
387   auto &MinNumMaxNum = getActionDefinitionsBuilder({
388       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
389 
390   if (ST.hasVOP3PInsts()) {
391     MinNumMaxNum.customFor(FPTypesPK16)
392       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
393       .clampMaxNumElements(0, S16, 2)
394       .clampScalar(0, S16, S64)
395       .scalarize(0);
396   } else if (ST.has16BitInsts()) {
397     MinNumMaxNum.customFor(FPTypes16)
398       .clampScalar(0, S16, S64)
399       .scalarize(0);
400   } else {
401     MinNumMaxNum.customFor(FPTypesBase)
402       .clampScalar(0, S32, S64)
403       .scalarize(0);
404   }
405 
406   if (ST.hasVOP3PInsts())
407     FPOpActions.clampMaxNumElements(0, S16, 2);
408 
409   FPOpActions
410     .scalarize(0)
411     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
412 
413   TrigActions
414     .scalarize(0)
415     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
416 
417   FDIVActions
418     .scalarize(0)
419     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
420 
421   getActionDefinitionsBuilder({G_FNEG, G_FABS})
422     .legalFor(FPTypesPK16)
423     .clampMaxNumElements(0, S16, 2)
424     .scalarize(0)
425     .clampScalar(0, S16, S64);
426 
427   if (ST.has16BitInsts()) {
428     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
429       .legalFor({S32, S64, S16})
430       .scalarize(0)
431       .clampScalar(0, S16, S64);
432   } else {
433     getActionDefinitionsBuilder(G_FSQRT)
434       .legalFor({S32, S64})
435       .scalarize(0)
436       .clampScalar(0, S32, S64);
437 
438     if (ST.hasFractBug()) {
439       getActionDefinitionsBuilder(G_FFLOOR)
440         .customFor({S64})
441         .legalFor({S32, S64})
442         .scalarize(0)
443         .clampScalar(0, S32, S64);
444     } else {
445       getActionDefinitionsBuilder(G_FFLOOR)
446         .legalFor({S32, S64})
447         .scalarize(0)
448         .clampScalar(0, S32, S64);
449     }
450   }
451 
452   getActionDefinitionsBuilder(G_FPTRUNC)
453     .legalFor({{S32, S64}, {S16, S32}})
454     .scalarize(0)
455     .lower();
456 
457   getActionDefinitionsBuilder(G_FPEXT)
458     .legalFor({{S64, S32}, {S32, S16}})
459     .lowerFor({{S64, S16}}) // FIXME: Implement
460     .scalarize(0);
461 
462   getActionDefinitionsBuilder(G_FSUB)
463       // Use actual fsub instruction
464       .legalFor({S32})
465       // Must use fadd + fneg
466       .lowerFor({S64, S16, V2S16})
467       .scalarize(0)
468       .clampScalar(0, S32, S64);
469 
470   // Whether this is legal depends on the floating point mode for the function.
471   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
472   if (ST.hasMadF16())
473     FMad.customFor({S32, S16});
474   else
475     FMad.customFor({S32});
476   FMad.scalarize(0)
477       .lower();
478 
479   getActionDefinitionsBuilder(G_TRUNC)
480     .alwaysLegal();
481 
482   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
483     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
484                {S32, S1}, {S64, S1}, {S16, S1}})
485     .scalarize(0)
486     .clampScalar(0, S32, S64)
487     .widenScalarToNextPow2(1, 32);
488 
489   // TODO: Split s1->s64 during regbankselect for VALU.
490   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
491     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
492     .lowerFor({{S32, S64}})
493     .lowerIf(typeIs(1, S1))
494     .customFor({{S64, S64}});
495   if (ST.has16BitInsts())
496     IToFP.legalFor({{S16, S16}});
497   IToFP.clampScalar(1, S32, S64)
498        .scalarize(0)
499        .widenScalarToNextPow2(1);
500 
501   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
502     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
503     .customFor({{S64, S64}});
504   if (ST.has16BitInsts())
505     FPToI.legalFor({{S16, S16}});
506   else
507     FPToI.minScalar(1, S32);
508 
509   FPToI.minScalar(0, S32)
510        .scalarize(0)
511        .lower();
512 
513   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
514     .scalarize(0)
515     .lower();
516 
517   if (ST.has16BitInsts()) {
518     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
519       .legalFor({S16, S32, S64})
520       .clampScalar(0, S16, S64)
521       .scalarize(0);
522   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
523     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
524       .legalFor({S32, S64})
525       .clampScalar(0, S32, S64)
526       .scalarize(0);
527   } else {
528     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
529       .legalFor({S32})
530       .customFor({S64})
531       .clampScalar(0, S32, S64)
532       .scalarize(0);
533   }
534 
535   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
536     .scalarize(0)
537     .alwaysLegal();
538 
539   auto &CmpBuilder =
540     getActionDefinitionsBuilder(G_ICMP)
541     // The compare output type differs based on the register bank of the output,
542     // so make both s1 and s32 legal.
543     //
544     // Scalar compares producing output in scc will be promoted to s32, as that
545     // is the allocatable register type that will be needed for the copy from
546     // scc. This will be promoted during RegBankSelect, and we assume something
547     // before that won't try to use s32 result types.
548     //
549     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
550     // bank.
551     .legalForCartesianProduct(
552       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
553     .legalForCartesianProduct(
554       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
555   if (ST.has16BitInsts()) {
556     CmpBuilder.legalFor({{S1, S16}});
557   }
558 
559   CmpBuilder
560     .widenScalarToNextPow2(1)
561     .clampScalar(1, S32, S64)
562     .scalarize(0)
563     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
564 
565   getActionDefinitionsBuilder(G_FCMP)
566     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
567     .widenScalarToNextPow2(1)
568     .clampScalar(1, S32, S64)
569     .scalarize(0);
570 
571   // FIXME: fpow has a selection pattern that should move to custom lowering.
572   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
573   if (ST.has16BitInsts())
574     Exp2Ops.legalFor({S32, S16});
575   else
576     Exp2Ops.legalFor({S32});
577   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
578   Exp2Ops.scalarize(0);
579 
580   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
581   if (ST.has16BitInsts())
582     ExpOps.customFor({{S32}, {S16}});
583   else
584     ExpOps.customFor({S32});
585   ExpOps.clampScalar(0, MinScalarFPTy, S32)
586         .scalarize(0);
587 
588   // The 64-bit versions produce 32-bit results, but only on the SALU.
589   getActionDefinitionsBuilder(G_CTPOP)
590     .legalFor({{S32, S32}, {S32, S64}})
591     .clampScalar(0, S32, S32)
592     .clampScalar(1, S32, S64)
593     .scalarize(0)
594     .widenScalarToNextPow2(0, 32)
595     .widenScalarToNextPow2(1, 32);
596 
597   // The hardware instructions return a different result on 0 than the generic
598   // instructions expect. The hardware produces -1, but these produce the
599   // bitwidth.
600   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
601     .scalarize(0)
602     .clampScalar(0, S32, S32)
603     .clampScalar(1, S32, S64)
604     .widenScalarToNextPow2(0, 32)
605     .widenScalarToNextPow2(1, 32)
606     .lower();
607 
608   // The 64-bit versions produce 32-bit results, but only on the SALU.
609   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
610     .legalFor({{S32, S32}, {S32, S64}})
611     .clampScalar(0, S32, S32)
612     .clampScalar(1, S32, S64)
613     .scalarize(0)
614     .widenScalarToNextPow2(0, 32)
615     .widenScalarToNextPow2(1, 32);
616 
617   getActionDefinitionsBuilder(G_BITREVERSE)
618     .legalFor({S32})
619     .clampScalar(0, S32, S32)
620     .scalarize(0);
621 
622   if (ST.has16BitInsts()) {
623     getActionDefinitionsBuilder(G_BSWAP)
624       .legalFor({S16, S32, V2S16})
625       .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
628       .widenScalarToNextPow2(0)
629       .clampScalar(0, S16, S32)
630       .scalarize(0);
631 
632     if (ST.hasVOP3PInsts()) {
633       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
634         .legalFor({S32, S16, V2S16})
635         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
636         .clampMaxNumElements(0, S16, 2)
637         .clampScalar(0, S16, S32)
638         .widenScalarToNextPow2(0)
639         .scalarize(0);
640     } else {
641       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
642         .legalFor({S32, S16})
643         .widenScalarToNextPow2(0)
644         .clampScalar(0, S16, S32)
645         .scalarize(0);
646     }
647   } else {
648     // TODO: Should have same legality without v_perm_b32
649     getActionDefinitionsBuilder(G_BSWAP)
650       .legalFor({S32})
651       .lowerIf(narrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
654       .widenScalarToNextPow2(0)
655       .maxScalar(0, S32)
656       .scalarize(0)
657       .lower();
658 
659     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
660       .legalFor({S32})
661       .clampScalar(0, S32, S32)
662       .widenScalarToNextPow2(0)
663       .scalarize(0);
664   }
665 
666   getActionDefinitionsBuilder(G_INTTOPTR)
667     // List the common cases
668     .legalForCartesianProduct(AddrSpaces64, {S64})
669     .legalForCartesianProduct(AddrSpaces32, {S32})
670     .scalarize(0)
671     // Accept any address space as long as the size matches
672     .legalIf(sameSize(0, 1))
673     .widenScalarIf(smallerThan(1, 0),
674       [](const LegalityQuery &Query) {
675         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
676       })
677     .narrowScalarIf(greaterThan(1, 0),
678       [](const LegalityQuery &Query) {
679         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
680       });
681 
682   getActionDefinitionsBuilder(G_PTRTOINT)
683     // List the common cases
684     .legalForCartesianProduct(AddrSpaces64, {S64})
685     .legalForCartesianProduct(AddrSpaces32, {S32})
686     .scalarize(0)
687     // Accept any address space as long as the size matches
688     .legalIf(sameSize(0, 1))
689     .widenScalarIf(smallerThan(0, 1),
690       [](const LegalityQuery &Query) {
691         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
692       })
693     .narrowScalarIf(
694       greaterThan(0, 1),
695       [](const LegalityQuery &Query) {
696         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
697       });
698 
699   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
700     .scalarize(0)
701     .custom();
702 
703   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
704   // handle some operations by just promoting the register during
705   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
706   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
707     switch (AS) {
708     // FIXME: Private element size.
709     case AMDGPUAS::PRIVATE_ADDRESS:
710       return 32;
711     // FIXME: Check subtarget
712     case AMDGPUAS::LOCAL_ADDRESS:
713       return ST.useDS128() ? 128 : 64;
714 
715     // Treat constant and global as identical. SMRD loads are sometimes usable
716     // for global loads (ideally constant address space should be eliminated)
717     // depending on the context. Legality cannot be context dependent, but
718     // RegBankSelect can split the load as necessary depending on the pointer
719     // register bank/uniformity and if the memory is invariant or not written in
720     // a kernel.
721     case AMDGPUAS::CONSTANT_ADDRESS:
722     case AMDGPUAS::GLOBAL_ADDRESS:
723       return IsLoad ? 512 : 128;
724     default:
725       return 128;
726     }
727   };
728 
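  // Return true if a load/store must be split: vector extloads, accesses wider
  // than the address space allows, 3-dword accesses without dwordx3 support,
  // non-power-of-2 register counts, and under-aligned accesses the target
  // cannot perform.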
729   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
730                                     bool IsLoad) -> bool {
731     const LLT DstTy = Query.Types[0];
732 
733     // Split vector extloads.
734     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
735     unsigned Align = Query.MMODescrs[0].AlignInBits;
736 
737     if (MemSize < DstTy.getSizeInBits())
738       MemSize = std::max(MemSize, Align);
739 
740     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
741       return true;
742 
743     const LLT PtrTy = Query.Types[1];
744     unsigned AS = PtrTy.getAddressSpace();
745     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
746       return true;
747 
748     // Catch weird sized loads that don't evenly divide into the access sizes
749     // TODO: May be able to widen depending on alignment etc.
750     unsigned NumRegs = (MemSize + 31) / 32;
751     if (NumRegs == 3) {
752       if (!ST.hasDwordx3LoadStores())
753         return true;
754     } else {
755       // If the alignment allows, these should have been widened.
756       if (!isPowerOf2_32(NumRegs))
757         return true;
758     }
759 
760     if (Align < MemSize) {
761       const SITargetLowering *TLI = ST.getTargetLowering();
762       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
763     }
764 
765     return false;
766   };
767 
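  // Return true if an oddly sized load should instead be widened to the next
  // power-of-2 size: the rounded size must still fit the address space limit
  // and be fully covered by the access alignment.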
768   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query) -> bool {
769     unsigned Size = Query.Types[0].getSizeInBits();
770     if (isPowerOf2_32(Size))
771       return false;
772 
773     if (Size == 96 && ST.hasDwordx3LoadStores())
774       return false;
775 
776     unsigned AddrSpace = Query.Types[1].getAddressSpace();
777     if (Size >= maxSizeForAddrSpace(AddrSpace, true))
778       return false;
779 
780     unsigned Align = Query.MMODescrs[0].AlignInBits;
781     unsigned RoundedSize = NextPowerOf2(Size);
782     return (Align >= RoundedSize);
783   };
784 
785   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
786   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
787   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
788 
789   // TODO: Refine based on subtargets which support unaligned access or 128-bit
790   // LDS
791   // TODO: Unsupported flat for SI.
792 
793   for (unsigned Op : {G_LOAD, G_STORE}) {
794     const bool IsStore = Op == G_STORE;
795 
796     auto &Actions = getActionDefinitionsBuilder(Op);
797     // Whitelist the common cases.
798     // TODO: Loads to s16 on gfx9
799     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
800                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
801                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
802                                       {S128, GlobalPtr, 128, GlobalAlign32},
803                                       {S64, GlobalPtr, 64, GlobalAlign32},
804                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
805                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
806                                       {S32, GlobalPtr, 8, GlobalAlign8},
807                                       {S32, GlobalPtr, 16, GlobalAlign16},
808 
809                                       {S32, LocalPtr, 32, 32},
810                                       {S64, LocalPtr, 64, 32},
811                                       {V2S32, LocalPtr, 64, 32},
812                                       {S32, LocalPtr, 8, 8},
813                                       {S32, LocalPtr, 16, 16},
814                                       {V2S16, LocalPtr, 32, 32},
815 
816                                       {S32, PrivatePtr, 32, 32},
817                                       {S32, PrivatePtr, 8, 8},
818                                       {S32, PrivatePtr, 16, 16},
819                                       {V2S16, PrivatePtr, 32, 32},
820 
821                                       {S32, FlatPtr, 32, GlobalAlign32},
822                                       {S32, FlatPtr, 16, GlobalAlign16},
823                                       {S32, FlatPtr, 8, GlobalAlign8},
824                                       {V2S16, FlatPtr, 32, GlobalAlign32},
825 
826                                       {S32, ConstantPtr, 32, GlobalAlign32},
827                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
828                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
829                                       {S64, ConstantPtr, 64, GlobalAlign32},
830                                       {S128, ConstantPtr, 128, GlobalAlign32},
831                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
832     Actions
833         .customIf(typeIs(1, Constant32Ptr))
834         // Widen suitably aligned loads by loading extra elements.
835         .moreElementsIf([=](const LegalityQuery &Query) {
836             const LLT Ty = Query.Types[0];
837             return Op == G_LOAD && Ty.isVector() &&
838                    shouldWidenLoadResult(Query);
839           }, moreElementsToNextPow2(0))
840         .widenScalarIf([=](const LegalityQuery &Query) {
841             const LLT Ty = Query.Types[0];
842             return Op == G_LOAD && !Ty.isVector() &&
843                    shouldWidenLoadResult(Query);
844           }, widenScalarOrEltToNextPow2(0))
845         .narrowScalarIf(
846             [=](const LegalityQuery &Query) -> bool {
847               return !Query.Types[0].isVector() &&
848                      needToSplitMemOp(Query, Op == G_LOAD);
849             },
850             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
851               const LLT DstTy = Query.Types[0];
852               const LLT PtrTy = Query.Types[1];
853 
854               const unsigned DstSize = DstTy.getSizeInBits();
855               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
856 
857               // Split extloads.
858               if (DstSize > MemSize)
859                 return std::make_pair(0, LLT::scalar(MemSize));
860 
861               if (!isPowerOf2_32(DstSize)) {
862                 // We're probably decomposing an odd sized store. Try to split
863                 // to the widest type. TODO: Account for alignment. As-is it
864                 // should be OK, since the new parts will be further legalized.
865                 unsigned FloorSize = PowerOf2Floor(DstSize);
866                 return std::make_pair(0, LLT::scalar(FloorSize));
867               }
868 
869               if (DstSize > 32 && (DstSize % 32 != 0)) {
870                 // FIXME: Need a way to specify non-extload of larger size if
871                 // suitably aligned.
872                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
873               }
874 
875               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
876                                                      Op == G_LOAD);
877               if (MemSize > MaxSize)
878                 return std::make_pair(0, LLT::scalar(MaxSize));
879 
880               unsigned Align = Query.MMODescrs[0].AlignInBits;
881               return std::make_pair(0, LLT::scalar(Align));
882             })
883         .fewerElementsIf(
884             [=](const LegalityQuery &Query) -> bool {
885               return Query.Types[0].isVector() &&
886                      needToSplitMemOp(Query, Op == G_LOAD);
887             },
888             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
889               const LLT DstTy = Query.Types[0];
890               const LLT PtrTy = Query.Types[1];
891 
892               LLT EltTy = DstTy.getElementType();
893               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
894                                                      Op == G_LOAD);
895 
896               // FIXME: Handle widened to power of 2 results better. This ends
897               // up scalarizing.
898               // FIXME: 3 element stores scalarized on SI
899 
900               // Split if it's too large for the address space.
901               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
902                 unsigned NumElts = DstTy.getNumElements();
903                 unsigned EltSize = EltTy.getSizeInBits();
904 
905                 if (MaxSize % EltSize == 0) {
906                   return std::make_pair(
907                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
908                 }
909 
910                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
911 
912                 // FIXME: Refine when odd breakdowns handled
913                 // The scalars will need to be re-legalized.
914                 if (NumPieces == 1 || NumPieces >= NumElts ||
915                     NumElts % NumPieces != 0)
916                   return std::make_pair(0, EltTy);
917 
918                 return std::make_pair(0,
919                                       LLT::vector(NumElts / NumPieces, EltTy));
920               }
921 
922               // FIXME: We could probably handle weird extending loads better.
923               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
924               if (DstTy.getSizeInBits() > MemSize)
925                 return std::make_pair(0, EltTy);
926 
927               unsigned EltSize = EltTy.getSizeInBits();
928               unsigned DstSize = DstTy.getSizeInBits();
929               if (!isPowerOf2_32(DstSize)) {
930                 // We're probably decomposing an odd sized store. Try to split
931                 // to the widest type. TODO: Account for alignment. As-is it
932                 // should be OK, since the new parts will be further legalized.
933                 unsigned FloorSize = PowerOf2Floor(DstSize);
934                 return std::make_pair(
935                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
936               }
937 
938               // Need to split because of alignment.
939               unsigned Align = Query.MMODescrs[0].AlignInBits;
940               if (EltSize > Align &&
941                   (EltSize / Align < DstTy.getNumElements())) {
942                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
943               }
944 
945               // May need relegalization for the scalars.
946               return std::make_pair(0, EltTy);
947             })
948         .minScalar(0, S32);
949 
950     if (IsStore)
951       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
952 
953     // TODO: Need a bitcast lower option?
954     Actions
955         .legalIf([=](const LegalityQuery &Query) {
956           const LLT Ty0 = Query.Types[0];
957           unsigned Size = Ty0.getSizeInBits();
958           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
959           unsigned Align = Query.MMODescrs[0].AlignInBits;
960 
961           // FIXME: Widening store from alignment not valid.
962           if (MemSize < Size)
963             MemSize = std::max(MemSize, Align);
964 
965           // No extending vector loads.
966           if (Size > MemSize && Ty0.isVector())
967             return false;
968 
969           switch (MemSize) {
970           case 8:
971           case 16:
972             return Size == 32;
973           case 32:
974           case 64:
975           case 128:
976             return true;
977           case 96:
978             return ST.hasDwordx3LoadStores();
979           case 256:
980           case 512:
981             return true;
982           default:
983             return false;
984           }
985         })
986         .widenScalarToNextPow2(0)
987         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
988   }
989 
990   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
991                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
992                                                   {S32, GlobalPtr, 16, 2 * 8},
993                                                   {S32, LocalPtr, 8, 8},
994                                                   {S32, LocalPtr, 16, 16},
995                                                   {S32, PrivatePtr, 8, 8},
996                                                   {S32, PrivatePtr, 16, 16},
997                                                   {S32, ConstantPtr, 8, 8},
998                                                   {S32, ConstantPtr, 16, 2 * 8}});
999   if (ST.hasFlatAddressSpace()) {
1000     ExtLoads.legalForTypesWithMemDesc(
1001         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1002   }
1003 
1004   ExtLoads.clampScalar(0, S32, S32)
1005           .widenScalarToNextPow2(0)
1006           .unsupportedIfMemSizeNotPow2()
1007           .lower();
1008 
1009   auto &Atomics = getActionDefinitionsBuilder(
1010     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1011      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1012      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1013      G_ATOMICRMW_UMIN})
1014     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1015                {S64, GlobalPtr}, {S64, LocalPtr}});
1016   if (ST.hasFlatAddressSpace()) {
1017     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1018   }
1019 
1020   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1021     .legalFor({{S32, LocalPtr}});
1022 
1023   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1024   // demarshalling
1025   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1026     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1027                 {S32, FlatPtr}, {S64, FlatPtr}})
1028     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1029                {S32, RegionPtr}, {S64, RegionPtr}});
1030   // TODO: Pointer types, any 32-bit or 64-bit vector
1031 
1032   // Condition should be s32 for scalar, s1 for vector.
1033   getActionDefinitionsBuilder(G_SELECT)
1034     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1035           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1036           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1037     .clampScalar(0, S16, S64)
1038     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1039     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1040     .scalarize(1)
1041     .clampMaxNumElements(0, S32, 2)
1042     .clampMaxNumElements(0, LocalPtr, 2)
1043     .clampMaxNumElements(0, PrivatePtr, 2)
1044     .scalarize(0)
1045     .widenScalarToNextPow2(0)
1046     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1047 
1048   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1049   // be more flexible with the shift amount type.
1050   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1051     .legalFor({{S32, S32}, {S64, S32}});
1052   if (ST.has16BitInsts()) {
1053     if (ST.hasVOP3PInsts()) {
1054       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
1055             .clampMaxNumElements(0, S16, 2);
1056     } else
1057       Shifts.legalFor({{S16, S32}, {S16, S16}});
1058 
1059     // TODO: Support 16-bit shift amounts
1060     Shifts.clampScalar(1, S32, S32);
1061     Shifts.clampScalar(0, S16, S64);
1062     Shifts.widenScalarToNextPow2(0, 16);
1063   } else {
1064     // Make sure we legalize the shift amount type first, as the general
1065     // expansion for the shifted type will produce much worse code if it hasn't
1066     // been truncated already.
1067     Shifts.clampScalar(1, S32, S32);
1068     Shifts.clampScalar(0, S32, S64);
1069     Shifts.widenScalarToNextPow2(0, 32);
1070   }
1071   Shifts.scalarize(0);
1072 
1073   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1074     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1075     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1076     unsigned IdxTypeIdx = 2;
1077 
1078     getActionDefinitionsBuilder(Op)
1079       .customIf([=](const LegalityQuery &Query) {
1080           const LLT EltTy = Query.Types[EltTypeIdx];
1081           const LLT VecTy = Query.Types[VecTypeIdx];
1082           const LLT IdxTy = Query.Types[IdxTypeIdx];
1083           return (EltTy.getSizeInBits() == 16 ||
1084                   EltTy.getSizeInBits() % 32 == 0) &&
1085                  VecTy.getSizeInBits() % 32 == 0 &&
1086                  VecTy.getSizeInBits() <= 1024 &&
1087                  IdxTy.getSizeInBits() == 32;
1088         })
1089       .clampScalar(EltTypeIdx, S32, S64)
1090       .clampScalar(VecTypeIdx, S32, S64)
1091       .clampScalar(IdxTypeIdx, S32, S32);
1092   }
1093 
1094   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1095     .unsupportedIf([=](const LegalityQuery &Query) {
1096         const LLT &EltTy = Query.Types[1].getElementType();
1097         return Query.Types[0] != EltTy;
1098       });
1099 
1100   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1101     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1102     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1103 
1104     // FIXME: Doesn't handle extract of illegal sizes.
1105     getActionDefinitionsBuilder(Op)
1106       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1107       // FIXME: Multiples of 16 should not be legal.
1108       .legalIf([=](const LegalityQuery &Query) {
1109           const LLT BigTy = Query.Types[BigTyIdx];
1110           const LLT LitTy = Query.Types[LitTyIdx];
1111           return (BigTy.getSizeInBits() % 32 == 0) &&
1112                  (LitTy.getSizeInBits() % 16 == 0);
1113         })
1114       .widenScalarIf(
1115         [=](const LegalityQuery &Query) {
1116           const LLT BigTy = Query.Types[BigTyIdx];
1117           return (BigTy.getScalarSizeInBits() < 16);
1118         },
1119         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1120       .widenScalarIf(
1121         [=](const LegalityQuery &Query) {
1122           const LLT LitTy = Query.Types[LitTyIdx];
1123           return (LitTy.getScalarSizeInBits() < 16);
1124         },
1125         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1126       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1127       .widenScalarToNextPow2(BigTyIdx, 32);
1128 
1129   }
1130 
1131   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1132     .legalForCartesianProduct(AllS32Vectors, {S32})
1133     .legalForCartesianProduct(AllS64Vectors, {S64})
1134     .clampNumElements(0, V16S32, V32S32)
1135     .clampNumElements(0, V2S64, V16S64)
1136     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1137 
1138   if (ST.hasScalarPackInsts()) {
1139     BuildVector
1140       // FIXME: Should probably widen s1 vectors straight to s32
1141       .minScalarOrElt(0, S16)
1142       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1143       .minScalar(1, S32);
1144 
1145     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1146       .legalFor({V2S16, S32})
1147       .lower();
1148     BuildVector.minScalarOrElt(0, S32);
1149   } else {
1150     BuildVector.customFor({V2S16, S16});
1151     BuildVector.minScalarOrElt(0, S32);
1152 
1153     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1154       .customFor({V2S16, S32})
1155       .lower();
1156   }
1157 
1158   BuildVector.legalIf(isRegisterType(0));
1159 
1160   // FIXME: Clamp maximum size
1161   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1162     .legalIf(isRegisterType(0));
1163 
// TODO: Don't fully scalarize v2s16 pieces? Or combine out those
// pre-legalize.
1166   if (ST.hasVOP3PInsts()) {
1167     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1168       .customFor({V2S16, V2S16})
1169       .lower();
1170   } else
1171     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1172 
1173   // Merge/Unmerge
1174   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1175     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1176     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1177 
1178     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1179       const LLT &Ty = Query.Types[TypeIdx];
1180       if (Ty.isVector()) {
1181         const LLT &EltTy = Ty.getElementType();
1182         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1183           return true;
1184         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1185           return true;
1186       }
1187       return false;
1188     };
1189 
1190     auto &Builder = getActionDefinitionsBuilder(Op)
1191       // Try to widen to s16 first for small types.
1192       // TODO: Only do this on targets with legal s16 shifts
1193       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1194 
1195       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1196       .lowerFor({{S16, V2S16}})
1197       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1198       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1199                            elementTypeIs(1, S16)),
1200                        changeTo(1, V2S16))
1201       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1202       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1203       // valid.
1204       .clampScalar(LitTyIdx, S32, S256)
1205       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1206       // Break up vectors with weird elements into scalars
1207       .fewerElementsIf(
1208         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1209         scalarize(0))
1210       .fewerElementsIf(
1211         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1212         scalarize(1))
1213       .clampScalar(BigTyIdx, S32, S1024);
1214 
1215     if (Op == G_MERGE_VALUES) {
1216       Builder.widenScalarIf(
1217         // TODO: Use 16-bit shifts if legal for 8-bit values?
1218         [=](const LegalityQuery &Query) {
1219           const LLT Ty = Query.Types[LitTyIdx];
1220           return Ty.getSizeInBits() < 32;
1221         },
1222         changeTo(LitTyIdx, S32));
1223     }
1224 
1225     Builder.widenScalarIf(
1226       [=](const LegalityQuery &Query) {
1227         const LLT Ty = Query.Types[BigTyIdx];
1228         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1229           Ty.getSizeInBits() % 16 != 0;
1230       },
1231       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128, whichever
        // is smaller.
1234         const LLT &Ty = Query.Types[BigTyIdx];
1235         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1236         if (NewSizeInBits >= 256) {
1237           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1238           if (RoundedTo < NewSizeInBits)
1239             NewSizeInBits = RoundedTo;
1240         }
1241         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1242       })
1243       .legalIf([=](const LegalityQuery &Query) {
1244           const LLT &BigTy = Query.Types[BigTyIdx];
1245           const LLT &LitTy = Query.Types[LitTyIdx];
1246 
1247           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1248             return false;
1249           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1250             return false;
1251 
1252           return BigTy.getSizeInBits() % 16 == 0 &&
1253                  LitTy.getSizeInBits() % 16 == 0 &&
1254                  BigTy.getSizeInBits() <= 1024;
1255         })
1256       // Any vectors left are the wrong size. Scalarize them.
1257       .scalarize(0)
1258       .scalarize(1);
1259   }
1260 
1261   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1262   // RegBankSelect.
1263   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1264     .legalFor({{S32}, {S64}});
1265 
1266   if (ST.hasVOP3PInsts()) {
1267     SextInReg.lowerFor({{V2S16}})
1268       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1269       // get more vector shift opportunities, since we'll get those when
1270       // expanded.
1271       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1272   } else if (ST.has16BitInsts()) {
1273     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1274   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
1277     SextInReg.lowerFor({{S32}, {S64}});
1278   }
1279 
1280   SextInReg
1281     .scalarize(0)
1282     .clampScalar(0, S32, S64)
1283     .lower();
1284 
1285   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1286     .legalFor({S64});
1287 
1288   getActionDefinitionsBuilder({
1289       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1290       G_FCOPYSIGN,
1291 
1292       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1293       G_READ_REGISTER,
1294       G_WRITE_REGISTER,
1295 
1296       G_SADDO, G_SSUBO,
1297 
1298        // TODO: Implement
1299       G_FMINIMUM, G_FMAXIMUM
1300     }).lower();
1301 
1302   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1303         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1304         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1305     .unsupported();
1306 
1307   computeTables();
1308   verify(*ST.getInstrInfo());
1309 }
1310 
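// Entry point for custom legalization: dispatch on the generic opcode to the
// per-operation handlers below.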
1311 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1312                                          MachineRegisterInfo &MRI,
1313                                          MachineIRBuilder &B,
1314                                          GISelChangeObserver &Observer) const {
1315   switch (MI.getOpcode()) {
1316   case TargetOpcode::G_ADDRSPACE_CAST:
1317     return legalizeAddrSpaceCast(MI, MRI, B);
1318   case TargetOpcode::G_FRINT:
1319     return legalizeFrint(MI, MRI, B);
1320   case TargetOpcode::G_FCEIL:
1321     return legalizeFceil(MI, MRI, B);
1322   case TargetOpcode::G_INTRINSIC_TRUNC:
1323     return legalizeIntrinsicTrunc(MI, MRI, B);
1324   case TargetOpcode::G_SITOFP:
1325     return legalizeITOFP(MI, MRI, B, true);
1326   case TargetOpcode::G_UITOFP:
1327     return legalizeITOFP(MI, MRI, B, false);
1328   case TargetOpcode::G_FPTOSI:
1329     return legalizeFPTOI(MI, MRI, B, true);
1330   case TargetOpcode::G_FPTOUI:
1331     return legalizeFPTOI(MI, MRI, B, false);
1332   case TargetOpcode::G_FMINNUM:
1333   case TargetOpcode::G_FMAXNUM:
1334   case TargetOpcode::G_FMINNUM_IEEE:
1335   case TargetOpcode::G_FMAXNUM_IEEE:
1336     return legalizeMinNumMaxNum(MI, MRI, B);
1337   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1338     return legalizeExtractVectorElt(MI, MRI, B);
1339   case TargetOpcode::G_INSERT_VECTOR_ELT:
1340     return legalizeInsertVectorElt(MI, MRI, B);
1341   case TargetOpcode::G_SHUFFLE_VECTOR:
1342     return legalizeShuffleVector(MI, MRI, B);
1343   case TargetOpcode::G_FSIN:
1344   case TargetOpcode::G_FCOS:
1345     return legalizeSinCos(MI, MRI, B);
1346   case TargetOpcode::G_GLOBAL_VALUE:
1347     return legalizeGlobalValue(MI, MRI, B);
1348   case TargetOpcode::G_LOAD:
1349     return legalizeLoad(MI, MRI, B, Observer);
1350   case TargetOpcode::G_FMAD:
1351     return legalizeFMad(MI, MRI, B);
1352   case TargetOpcode::G_FDIV:
1353     return legalizeFDIV(MI, MRI, B);
1354   case TargetOpcode::G_UDIV:
1355   case TargetOpcode::G_UREM:
1356     return legalizeUDIV_UREM(MI, MRI, B);
1357   case TargetOpcode::G_SDIV:
1358   case TargetOpcode::G_SREM:
1359     return legalizeSDIV_SREM(MI, MRI, B);
1360   case TargetOpcode::G_ATOMIC_CMPXCHG:
1361     return legalizeAtomicCmpXChg(MI, MRI, B);
1362   case TargetOpcode::G_FLOG:
1363     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1364   case TargetOpcode::G_FLOG10:
1365     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1366   case TargetOpcode::G_FEXP:
1367     return legalizeFExp(MI, B);
1368   case TargetOpcode::G_FPOW:
1369     return legalizeFPow(MI, B);
1370   case TargetOpcode::G_FFLOOR:
1371     return legalizeFFloor(MI, MRI, B);
1372   case TargetOpcode::G_BUILD_VECTOR:
1373     return legalizeBuildVector(MI, MRI, B);
1374   default:
1375     return false;
1376   }
1377 
1378   llvm_unreachable("expected switch to return");
1379 }
1380 
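// Return a 32-bit value holding the high half (aperture) of the flat address
// range for the given LDS or private address space. Read it from the aperture
// registers when available, otherwise load it from the queue pointer.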
1381 Register AMDGPULegalizerInfo::getSegmentAperture(
1382   unsigned AS,
1383   MachineRegisterInfo &MRI,
1384   MachineIRBuilder &B) const {
1385   MachineFunction &MF = B.getMF();
1386   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1387   const LLT S32 = LLT::scalar(32);
1388 
1389   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1390 
1391   if (ST.hasApertureRegs()) {
1392     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1393     // getreg.
1394     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1395         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1396         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1397     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1398         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1399         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1400     unsigned Encoding =
1401         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1402         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1403         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1404 
1405     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1406 
1407     B.buildInstr(AMDGPU::S_GETREG_B32)
1408       .addDef(GetReg)
1409       .addImm(Encoding);
1410     MRI.setType(GetReg, S32);
1411 
1412     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1413     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1414   }
1415 
1416   Register QueuePtr = MRI.createGenericVirtualRegister(
1417     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1418 
1419   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1420   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1421     return Register();
1422 
1423   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1424   // private_segment_aperture_base_hi.
1425   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1426 
1427   // TODO: can we be smarter about machine pointer info?
1428   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1429   MachineMemOperand *MMO = MF.getMachineMemOperand(
1430     PtrInfo,
1431     MachineMemOperand::MOLoad |
1432     MachineMemOperand::MODereferenceable |
1433     MachineMemOperand::MOInvariant,
1434     4,
1435     MinAlign(64, StructOffset));
1436 
1437   Register LoadAddr;
1438 
1439   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1440   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1441 }
1442 
1443 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1444   MachineInstr &MI, MachineRegisterInfo &MRI,
1445   MachineIRBuilder &B) const {
1446   MachineFunction &MF = B.getMF();
1447 
1448   B.setInstr(MI);
1449 
1450   const LLT S32 = LLT::scalar(32);
1451   Register Dst = MI.getOperand(0).getReg();
1452   Register Src = MI.getOperand(1).getReg();
1453 
1454   LLT DstTy = MRI.getType(Dst);
1455   LLT SrcTy = MRI.getType(Src);
1456   unsigned DestAS = DstTy.getAddressSpace();
1457   unsigned SrcAS = SrcTy.getAddressSpace();
1458 
1459   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1460   // vector element.
1461   assert(!DstTy.isVector());
1462 
1463   const AMDGPUTargetMachine &TM
1464     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1465 
1466   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1467   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1468     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1469     return true;
1470   }
1471 
1472   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1473     // Truncate.
1474     B.buildExtract(Dst, Src, 0);
1475     MI.eraseFromParent();
1476     return true;
1477   }
1478 
1479   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1480     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1481     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1482 
    // FIXME: This is a bit ugly since it merges two 32-bit pointers into a
    // 64-bit pointer of a different address space. Merge operands are required
    // to be the same type, but creating an extra ptrtoint here would be kind
    // of pointless.
1486     auto HighAddr = B.buildConstant(
1487       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1488     B.buildMerge(Dst, {Src, HighAddr});
1489     MI.eraseFromParent();
1490     return true;
1491   }
1492 
1493   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1494     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1495            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1496     unsigned NullVal = TM.getNullPointerValue(DestAS);
1497 
1498     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1499     auto FlatNull = B.buildConstant(SrcTy, 0);
1500 
1501     // Extract low 32-bits of the pointer.
1502     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1503 
1504     auto CmpRes =
1505         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1506     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1507 
1508     MI.eraseFromParent();
1509     return true;
1510   }
1511 
1512   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1513     return false;
1514 
1515   if (!ST.hasFlatAddressSpace())
1516     return false;
1517 
1518   auto SegmentNull =
1519       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1520   auto FlatNull =
1521       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1522 
1523   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1524   if (!ApertureReg.isValid())
1525     return false;
1526 
1527   auto CmpRes =
1528       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1529 
1530   // Coerce the type of the low half of the result so we can use merge_values.
1531   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1532 
1533   // TODO: Should we allow mismatched types but matching sizes in merges to
1534   // avoid the ptrtoint?
1535   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1536   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1537 
1538   MI.eraseFromParent();
1539   return true;
1540 }
1541 
1542 bool AMDGPULegalizerInfo::legalizeFrint(
1543   MachineInstr &MI, MachineRegisterInfo &MRI,
1544   MachineIRBuilder &B) const {
1545   B.setInstr(MI);
1546 
1547   Register Src = MI.getOperand(1).getReg();
1548   LLT Ty = MRI.getType(Src);
1549   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1550 
1551   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1552   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1553 
1554   auto C1 = B.buildFConstant(Ty, C1Val);
1555   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1556 
1557   // TODO: Should this propagate fast-math-flags?
1558   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1559   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1560 
1561   auto C2 = B.buildFConstant(Ty, C2Val);
1562   auto Fabs = B.buildFAbs(Ty, Src);
1563 
1564   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1565   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1566   return true;
1567 }
1568 
1569 bool AMDGPULegalizerInfo::legalizeFceil(
1570   MachineInstr &MI, MachineRegisterInfo &MRI,
1571   MachineIRBuilder &B) const {
1572   B.setInstr(MI);
1573 
1574   const LLT S1 = LLT::scalar(1);
1575   const LLT S64 = LLT::scalar(64);
1576 
1577   Register Src = MI.getOperand(1).getReg();
1578   assert(MRI.getType(Src) == S64);
1579 
1580   // result = trunc(src)
1581   // if (src > 0.0 && src != result)
1582   //   result += 1.0
1583 
1584   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1585 
1586   const auto Zero = B.buildFConstant(S64, 0.0);
1587   const auto One = B.buildFConstant(S64, 1.0);
1588   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1589   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1590   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1591   auto Add = B.buildSelect(S64, And, One, Zero);
1592 
1593   // TODO: Should this propagate fast-math-flags?
1594   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1595   return true;
1596 }
1597 
1598 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1599                                               MachineIRBuilder &B) {
1600   const unsigned FractBits = 52;
1601   const unsigned ExpBits = 11;
1602   LLT S32 = LLT::scalar(32);
1603 
1604   auto Const0 = B.buildConstant(S32, FractBits - 32);
1605   auto Const1 = B.buildConstant(S32, ExpBits);
1606 
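  // ubfe(Hi, 20, 11) extracts the 11-bit biased exponent from bits [30:20] of
  // the f64's high dword; subtracting 1023 below removes the IEEE-754 bias.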
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1610 
1611   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1612 }
1613 
1614 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1615   MachineInstr &MI, MachineRegisterInfo &MRI,
1616   MachineIRBuilder &B) const {
1617   B.setInstr(MI);
1618 
1619   const LLT S1 = LLT::scalar(1);
1620   const LLT S32 = LLT::scalar(32);
1621   const LLT S64 = LLT::scalar(64);
1622 
1623   Register Src = MI.getOperand(1).getReg();
1624   assert(MRI.getType(Src) == S64);
1625 
1626   // TODO: Should this use extract since the low half is unused?
1627   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1628   Register Hi = Unmerge.getReg(1);
1629 
1630   // Extract the upper half, since this is where we will find the sign and
1631   // exponent.
1632   auto Exp = extractF64Exponent(Hi, B);
1633 
1634   const unsigned FractBits = 52;
1635 
1636   // Extract the sign bit.
1637   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1638   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1639 
1640   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1641 
1642   const auto Zero32 = B.buildConstant(S32, 0);
1643 
1644   // Extend back to 64-bits.
1645   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1646 
1647   auto Shr = B.buildAShr(S64, FractMask, Exp);
1648   auto Not = B.buildNot(S64, Shr);
1649   auto Tmp0 = B.buildAnd(S64, Src, Not);
1650   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1651 
1652   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1653   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1654 
1655   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1656   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1657   return true;
1658 }
1659 
1660 bool AMDGPULegalizerInfo::legalizeITOFP(
1661   MachineInstr &MI, MachineRegisterInfo &MRI,
1662   MachineIRBuilder &B, bool Signed) const {
1663   B.setInstr(MI);
1664 
1665   Register Dst = MI.getOperand(0).getReg();
1666   Register Src = MI.getOperand(1).getReg();
1667 
1668   const LLT S64 = LLT::scalar(64);
1669   const LLT S32 = LLT::scalar(32);
1670 
1671   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1672 
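  // Lower the 64-bit integer in two 32-bit halves:
  //   result = (hi as f64) * 2^32 + (lo as unsigned f64)
  // Only the high half carries the sign in the signed case; the low half is
  // always unsigned. ldexp(CvtHi, 32) performs the exact 2^32 scaling.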
1673   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1674 
1675   auto CvtHi = Signed ?
1676     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1677     B.buildUITOFP(S64, Unmerge.getReg(1));
1678 
1679   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1680 
1681   auto ThirtyTwo = B.buildConstant(S32, 32);
1682   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1683     .addUse(CvtHi.getReg(0))
1684     .addUse(ThirtyTwo.getReg(0));
1685 
1686   // TODO: Should this propagate fast-math-flags?
1687   B.buildFAdd(Dst, LdExp, CvtLo);
1688   MI.eraseFromParent();
1689   return true;
1690 }
1691 
1692 // TODO: Copied from DAG implementation. Verify logic and document how this
1693 // actually works.
1694 bool AMDGPULegalizerInfo::legalizeFPTOI(
1695   MachineInstr &MI, MachineRegisterInfo &MRI,
1696   MachineIRBuilder &B, bool Signed) const {
1697   B.setInstr(MI);
1698 
1699   Register Dst = MI.getOperand(0).getReg();
1700   Register Src = MI.getOperand(1).getReg();
1701 
1702   const LLT S64 = LLT::scalar(64);
1703   const LLT S32 = LLT::scalar(32);
1704 
1705   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1706 
1707   unsigned Flags = MI.getFlags();
1708 
1709   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1710   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1711   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
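  // K0 is 2^-32 and K1 is -2^32 as f64 bit patterns. Below, Hi becomes
  // floor(trunc * 2^-32) and the fma computes trunc - Hi * 2^32, i.e. the
  // high and low 32-bit halves of the truncated value.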
1712 
1713   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1714   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1715   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1716 
1717   auto Hi = Signed ?
1718     B.buildFPTOSI(S32, FloorMul) :
1719     B.buildFPTOUI(S32, FloorMul);
1720   auto Lo = B.buildFPTOUI(S32, Fma);
1721 
1722   B.buildMerge(Dst, { Lo, Hi });
1723   MI.eraseFromParent();
1724 
1725   return true;
1726 }
1727 
1728 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1729   MachineInstr &MI, MachineRegisterInfo &MRI,
1730   MachineIRBuilder &B) const {
1731   MachineFunction &MF = B.getMF();
1732   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1733 
1734   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1735                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1736 
1737   // With ieee_mode disabled, the instructions have the correct behavior
1738   // already for G_FMINNUM/G_FMAXNUM
1739   if (!MFI->getMode().IEEE)
1740     return !IsIEEEOp;
1741 
1742   if (IsIEEEOp)
1743     return true;
1744 
1745   MachineIRBuilder HelperBuilder(MI);
1746   GISelObserverWrapper DummyObserver;
1747   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1748   HelperBuilder.setInstr(MI);
1749   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1750 }
1751 
1752 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1753   MachineInstr &MI, MachineRegisterInfo &MRI,
1754   MachineIRBuilder &B) const {
1755   // TODO: Should move some of this into LegalizerHelper.
1756 
1757   // TODO: Promote dynamic indexing of s16 to s32
1758 
1759   // FIXME: Artifact combiner probably should have replaced the truncated
1760   // constant before this, so we shouldn't need
1761   // getConstantVRegValWithLookThrough.
1762   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1763     MI.getOperand(2).getReg(), MRI);
1764   if (!IdxVal) // Dynamic case will be selected to register indexing.
1765     return true;
1766 
1767   Register Dst = MI.getOperand(0).getReg();
1768   Register Vec = MI.getOperand(1).getReg();
1769 
1770   LLT VecTy = MRI.getType(Vec);
1771   LLT EltTy = VecTy.getElementType();
1772   assert(EltTy == MRI.getType(Dst));
1773 
1774   B.setInstr(MI);
1775 
1776   if (IdxVal->Value < VecTy.getNumElements())
1777     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1778   else
1779     B.buildUndef(Dst);
1780 
1781   MI.eraseFromParent();
1782   return true;
1783 }
1784 
1785 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1786   MachineInstr &MI, MachineRegisterInfo &MRI,
1787   MachineIRBuilder &B) const {
1788   // TODO: Should move some of this into LegalizerHelper.
1789 
1790   // TODO: Promote dynamic indexing of s16 to s32
1791 
1792   // FIXME: Artifact combiner probably should have replaced the truncated
1793   // constant before this, so we shouldn't need
1794   // getConstantVRegValWithLookThrough.
1795   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1796     MI.getOperand(3).getReg(), MRI);
1797   if (!IdxVal) // Dynamic case will be selected to register indexing.
1798     return true;
1799 
1800   Register Dst = MI.getOperand(0).getReg();
1801   Register Vec = MI.getOperand(1).getReg();
1802   Register Ins = MI.getOperand(2).getReg();
1803 
1804   LLT VecTy = MRI.getType(Vec);
1805   LLT EltTy = VecTy.getElementType();
1806   assert(EltTy == MRI.getType(Ins));
1807 
1808   B.setInstr(MI);
1809 
1810   if (IdxVal->Value < VecTy.getNumElements())
1811     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1812   else
1813     B.buildUndef(Dst);
1814 
1815   MI.eraseFromParent();
1816   return true;
1817 }
1818 
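// A v2s16 shuffle can be left as-is when both result elements (ignoring
// undefs) are taken from the same 32-bit half of the concatenated sources;
// such shuffles can presumably be selected directly with VOP3P op_sel
// modifiers instead of being lowered element by element.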
1819 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1820   assert(Mask.size() == 2);
1821 
1822   // If one half is undef, the other is trivially in the same reg.
1823   if (Mask[0] == -1 || Mask[1] == -1)
1824     return true;
1825   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1826          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1827 }
1828 
1829 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1830   MachineInstr &MI, MachineRegisterInfo &MRI,
1831   MachineIRBuilder &B) const {
1832   const LLT V2S16 = LLT::vector(2, 16);
1833 
1834   Register Dst = MI.getOperand(0).getReg();
1835   Register Src0 = MI.getOperand(1).getReg();
1836   LLT DstTy = MRI.getType(Dst);
1837   LLT SrcTy = MRI.getType(Src0);
1838 
1839   if (SrcTy == V2S16 && DstTy == V2S16 &&
1840       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1841     return true;
1842 
1843   MachineIRBuilder HelperBuilder(MI);
1844   GISelObserverWrapper DummyObserver;
1845   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1846   HelperBuilder.setInstr(MI);
1847   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1848 }
1849 
1850 bool AMDGPULegalizerInfo::legalizeSinCos(
1851   MachineInstr &MI, MachineRegisterInfo &MRI,
1852   MachineIRBuilder &B) const {
1853   B.setInstr(MI);
1854 
1855   Register DstReg = MI.getOperand(0).getReg();
1856   Register SrcReg = MI.getOperand(1).getReg();
1857   LLT Ty = MRI.getType(DstReg);
1858   unsigned Flags = MI.getFlags();
1859 
1860   Register TrigVal;
1861   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1862   if (ST.hasTrigReducedRange()) {
1863     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1864     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1865       .addUse(MulVal.getReg(0))
1866       .setMIFlags(Flags).getReg(0);
1867   } else
1868     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1869 
1870   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1871     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1872   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1873     .addUse(TrigVal)
1874     .setMIFlags(Flags);
1875   MI.eraseFromParent();
1876   return true;
1877 }
1878 
1879 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1880   Register DstReg, LLT PtrTy,
1881   MachineIRBuilder &B, const GlobalValue *GV,
1882   unsigned Offset, unsigned GAFlags) const {
1883   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1884   // to the following code sequence:
1885   //
1886   // For constant address space:
1887   //   s_getpc_b64 s[0:1]
1888   //   s_add_u32 s0, s0, $symbol
1889   //   s_addc_u32 s1, s1, 0
1890   //
1891   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1892   //   a fixup or relocation is emitted to replace $symbol with a literal
1893   //   constant, which is a pc-relative offset from the encoding of the $symbol
1894   //   operand to the global variable.
1895   //
1896   // For global address space:
1897   //   s_getpc_b64 s[0:1]
1898   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1899   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1900   //
1901   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1902   //   fixups or relocations are emitted to replace $symbol@*@lo and
1903   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1904   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1905   //   operand to the global variable.
1906   //
1907   // What we want here is an offset from the value returned by s_getpc
1908   // (which is the address of the s_add_u32 instruction) to the global
1909   // variable, but since the encoding of $symbol starts 4 bytes after the start
1910   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1911   // small. This requires us to add 4 to the global variable offset in order to
1912   // compute the correct address.
1913 
1914   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1915 
1916   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1917     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1918 
1919   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1920     .addDef(PCReg);
1921 
1922   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1923   if (GAFlags == SIInstrInfo::MO_NONE)
1924     MIB.addImm(0);
1925   else
1926     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1927 
1928   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1929 
1930   if (PtrTy.getSizeInBits() == 32)
1931     B.buildExtract(DstReg, PCReg, 0);
1932   return true;
}
1934 
1935 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1936   MachineInstr &MI, MachineRegisterInfo &MRI,
1937   MachineIRBuilder &B) const {
1938   Register DstReg = MI.getOperand(0).getReg();
1939   LLT Ty = MRI.getType(DstReg);
1940   unsigned AS = Ty.getAddressSpace();
1941 
1942   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1943   MachineFunction &MF = B.getMF();
1944   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1945   B.setInstr(MI);
1946 
1947   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1948     if (!MFI->isEntryFunction()) {
1949       const Function &Fn = MF.getFunction();
1950       DiagnosticInfoUnsupported BadLDSDecl(
1951         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1952       Fn.getContext().diagnose(BadLDSDecl);
1953     }
1954 
1955     // TODO: We could emit code to handle the initialization somewhere.
1956     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1957       const SITargetLowering *TLI = ST.getTargetLowering();
1958       if (!TLI->shouldUseLDSConstAddress(GV)) {
1959         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1961       }
1962 
1963       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1964       MI.eraseFromParent();
1965       return true;
1966     }
1967 
1968     const Function &Fn = MF.getFunction();
1969     DiagnosticInfoUnsupported BadInit(
1970       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1971     Fn.getContext().diagnose(BadInit);
1972     return true;
1973   }
1974 
1975   const SITargetLowering *TLI = ST.getTargetLowering();
1976 
1977   if (TLI->shouldEmitFixup(GV)) {
1978     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1979     MI.eraseFromParent();
1980     return true;
1981   }
1982 
1983   if (TLI->shouldEmitPCReloc(GV)) {
1984     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1985     MI.eraseFromParent();
1986     return true;
1987   }
1988 
1989   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1990   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1991 
1992   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1993     MachinePointerInfo::getGOT(MF),
1994     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1995     MachineMemOperand::MOInvariant,
1996     8 /*Size*/, 8 /*Align*/);
1997 
1998   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1999 
2000   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
2002     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2003     B.buildExtract(DstReg, Load, 0);
2004   } else
2005     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2006 
2007   MI.eraseFromParent();
2008   return true;
2009 }
2010 
2011 bool AMDGPULegalizerInfo::legalizeLoad(
2012   MachineInstr &MI, MachineRegisterInfo &MRI,
2013   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2014   B.setInstr(MI);
2015   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2016   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2017   Observer.changingInstr(MI);
2018   MI.getOperand(1).setReg(Cast.getReg(0));
2019   Observer.changedInstr(MI);
2020   return true;
2021 }
2022 
2023 bool AMDGPULegalizerInfo::legalizeFMad(
2024   MachineInstr &MI, MachineRegisterInfo &MRI,
2025   MachineIRBuilder &B) const {
2026   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2027   assert(Ty.isScalar());
2028 
2029   MachineFunction &MF = B.getMF();
2030   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2031 
2032   // TODO: Always legal with future ftz flag.
  // FIXME: Do we only need to check the output?
2034   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2035     return true;
2036   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2037     return true;
2038 
2039   MachineIRBuilder HelperBuilder(MI);
2040   GISelObserverWrapper DummyObserver;
2041   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2042   HelperBuilder.setMBB(*MI.getParent());
2043   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2044 }
2045 
2046 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2047   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2048   Register DstReg = MI.getOperand(0).getReg();
2049   Register PtrReg = MI.getOperand(1).getReg();
2050   Register CmpVal = MI.getOperand(2).getReg();
2051   Register NewVal = MI.getOperand(3).getReg();
2052 
2053   assert(SITargetLowering::isFlatGlobalAddrSpace(
2054            MRI.getType(PtrReg).getAddressSpace()) &&
2055          "this should not have been custom lowered");
2056 
2057   LLT ValTy = MRI.getType(CmpVal);
2058   LLT VecTy = LLT::vector(2, ValTy);
2059 
2060   B.setInstr(MI);
2061   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2062 
2063   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2064     .addDef(DstReg)
2065     .addUse(PtrReg)
2066     .addUse(PackedVal)
2067     .setMemRefs(MI.memoperands());
2068 
2069   MI.eraseFromParent();
2070   return true;
2071 }
2072 
2073 bool AMDGPULegalizerInfo::legalizeFlog(
2074   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2075   Register Dst = MI.getOperand(0).getReg();
2076   Register Src = MI.getOperand(1).getReg();
2077   LLT Ty = B.getMRI()->getType(Dst);
2078   unsigned Flags = MI.getFlags();
2079   B.setInstr(MI);
2080 
2081   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2082   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2083 
2084   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2085   MI.eraseFromParent();
2086   return true;
2087 }
2088 
2089 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2090                                        MachineIRBuilder &B) const {
2091   Register Dst = MI.getOperand(0).getReg();
2092   Register Src = MI.getOperand(1).getReg();
2093   unsigned Flags = MI.getFlags();
2094   LLT Ty = B.getMRI()->getType(Dst);
2095   B.setInstr(MI);
2096 
2097   auto K = B.buildFConstant(Ty, numbers::log2e);
2098   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2099   B.buildFExp2(Dst, Mul, Flags);
2100   MI.eraseFromParent();
2101   return true;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2105                                        MachineIRBuilder &B) const {
2106   Register Dst = MI.getOperand(0).getReg();
2107   Register Src0 = MI.getOperand(1).getReg();
2108   Register Src1 = MI.getOperand(2).getReg();
2109   unsigned Flags = MI.getFlags();
2110   LLT Ty = B.getMRI()->getType(Dst);
2111   B.setInstr(MI);
2112   const LLT S16 = LLT::scalar(16);
2113   const LLT S32 = LLT::scalar(32);
2114 
2115   if (Ty == S32) {
2116     auto Log = B.buildFLog2(S32, Src0, Flags);
2117     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2118       .addUse(Log.getReg(0))
2119       .addUse(Src1)
2120       .setMIFlags(Flags);
2121     B.buildFExp2(Dst, Mul, Flags);
2122   } else if (Ty == S16) {
2123     // There's no f16 fmul_legacy, so we need to convert for it.
2124     auto Log = B.buildFLog2(S16, Src0, Flags);
2125     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2126     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2127     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2128       .addUse(Ext0.getReg(0))
2129       .addUse(Ext1.getReg(0))
2130       .setMIFlags(Flags);
2131 
2132     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2133   } else
2134     return false;
2135 
2136   MI.eraseFromParent();
2137   return true;
2138 }
2139 
2140 // Find a source register, ignoring any possible source modifiers.
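// For example, given %y = G_FABS(G_FNEG(%x)) this looks through both
// modifiers and returns %x.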
2141 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2142   Register ModSrc = OrigSrc;
2143   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2144     ModSrc = SrcFNeg->getOperand(1).getReg();
2145     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2146       ModSrc = SrcFAbs->getOperand(1).getReg();
2147   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2148     ModSrc = SrcFAbs->getOperand(1).getReg();
2149   return ModSrc;
2150 }
2151 
2152 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2153                                          MachineRegisterInfo &MRI,
2154                                          MachineIRBuilder &B) const {
2155   B.setInstr(MI);
2156 
2157   const LLT S1 = LLT::scalar(1);
2158   const LLT S64 = LLT::scalar(64);
2159   Register Dst = MI.getOperand(0).getReg();
2160   Register OrigSrc = MI.getOperand(1).getReg();
2161   unsigned Flags = MI.getFlags();
2162   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2163          "this should not have been custom lowered");
2164 
2165   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2166   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2167   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2168   // V_FRACT bug is:
2169   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2170   //
2171   // Convert floor(x) to (x - fract(x))
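  //
  // For example, floor(-0.5) = -0.5 - fract(-0.5) = -0.5 - 0.5 = -1.0. The
  // constant 0x3fefffffffffffff used below is the largest double below 1.0,
  // clamping cases where the buggy V_FRACT would otherwise return 1.0.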
2172 
2173   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2174     .addUse(OrigSrc)
2175     .setMIFlags(Flags);
2176 
2177   // Give source modifier matching some assistance before obscuring a foldable
2178   // pattern.
2179 
  // TODO: Can we avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
2182   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2183 
2184   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2185 
2186   Register Min = MRI.createGenericVirtualRegister(S64);
2187 
2188   // We don't need to concern ourselves with the snan handling difference, so
2189   // use the one which will directly select.
2190   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2191   if (MFI->getMode().IEEE)
2192     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2193   else
2194     B.buildFMinNum(Min, Fract, Const, Flags);
2195 
2196   Register CorrectedFract = Min;
2197   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2198     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2199     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2200   }
2201 
2202   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2203   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2204 
2205   MI.eraseFromParent();
2206   return true;
2207 }
2208 
2209 // Turn an illegal packed v2s16 build vector into bit operations.
2210 // TODO: This should probably be a bitcast action in LegalizerHelper.
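// The two s16 sources are merged into an s32 (first source in the low
// 16 bits) and the result is bitcast back to v2s16.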
2211 bool AMDGPULegalizerInfo::legalizeBuildVector(
2212   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2213   Register Dst = MI.getOperand(0).getReg();
2214   LLT DstTy = MRI.getType(Dst);
2215   const LLT S32 = LLT::scalar(32);
2216   const LLT V2S16 = LLT::vector(2, 16);
2217   (void)DstTy;
2218   (void)V2S16;
2219   assert(DstTy == V2S16);
2220 
2221   Register Src0 = MI.getOperand(1).getReg();
2222   Register Src1 = MI.getOperand(2).getReg();
2223   assert(MRI.getType(Src0) == LLT::scalar(16));
2224 
2225   B.setInstr(MI);
2226   auto Merge = B.buildMerge(S32, {Src0, Src1});
2227   B.buildBitcast(Dst, Merge);
2228 
2229   MI.eraseFromParent();
2230   return true;
2231 }
2232 
2233 // Return the use branch instruction, otherwise null if the usage is invalid.
2234 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2235                                        MachineRegisterInfo &MRI,
2236                                        MachineInstr *&Br) {
2237   Register CondDef = MI.getOperand(0).getReg();
2238   if (!MRI.hasOneNonDBGUse(CondDef))
2239     return nullptr;
2240 
2241   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2242   if (UseMI.getParent() != MI.getParent() ||
2243       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2244     return nullptr;
2245 
2246   // Make sure the cond br is followed by a G_BR
2247   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2248   if (Next != MI.getParent()->end()) {
2249     if (Next->getOpcode() != AMDGPU::G_BR)
2250       return nullptr;
2251     Br = &*Next;
2252   }
2253 
2254   return &UseMI;
2255 }
2256 
2257 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2258                                                 Register Reg, LLT Ty) const {
2259   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2260   if (LiveIn)
2261     return LiveIn;
2262 
2263   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2264   MRI.addLiveIn(Reg, NewReg);
2265   return NewReg;
2266 }
2267 
2268 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2269                                          const ArgDescriptor *Arg) const {
2270   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2271     return false; // TODO: Handle these
2272 
2273   assert(Arg->getRegister().isPhysical());
2274 
2275   MachineRegisterInfo &MRI = *B.getMRI();
2276 
2277   LLT Ty = MRI.getType(DstReg);
2278   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2279 
2280   if (Arg->isMasked()) {
2281     // TODO: Should we try to emit this once in the entry block?
2282     const LLT S32 = LLT::scalar(32);
2283     const unsigned Mask = Arg->getMask();
2284     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2285 
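    // e.g. a mask of 0x3ff00000 selects bits [29:20]: shift the live-in down
    // by 20 and mask with (Mask >> Shift) so only that field remains.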
2286     Register AndMaskSrc = LiveIn;
2287 
2288     if (Shift != 0) {
2289       auto ShiftAmt = B.buildConstant(S32, Shift);
2290       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2291     }
2292 
2293     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2294   } else
2295     B.buildCopy(DstReg, LiveIn);
2296 
  // Insert the argument copy if it doesn't already exist.
2298   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2299   if (!MRI.getVRegDef(LiveIn)) {
2300     // FIXME: Should have scoped insert pt
2301     MachineBasicBlock &OrigInsBB = B.getMBB();
2302     auto OrigInsPt = B.getInsertPt();
2303 
2304     MachineBasicBlock &EntryMBB = B.getMF().front();
2305     EntryMBB.addLiveIn(Arg->getRegister());
2306     B.setInsertPt(EntryMBB, EntryMBB.begin());
2307     B.buildCopy(LiveIn, Arg->getRegister());
2308 
2309     B.setInsertPt(OrigInsBB, OrigInsPt);
2310   }
2311 
2312   return true;
2313 }
2314 
2315 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2316   MachineInstr &MI,
2317   MachineRegisterInfo &MRI,
2318   MachineIRBuilder &B,
2319   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2320   B.setInstr(MI);
2321 
2322   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2323 
2324   const ArgDescriptor *Arg;
2325   const TargetRegisterClass *RC;
2326   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2327   if (!Arg) {
2328     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2329     return false;
2330   }
2331 
2332   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2333     MI.eraseFromParent();
2334     return true;
2335   }
2336 
2337   return false;
2338 }
2339 
2340 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2341                                        MachineRegisterInfo &MRI,
2342                                        MachineIRBuilder &B) const {
2343   B.setInstr(MI);
2344   Register Dst = MI.getOperand(0).getReg();
2345   LLT DstTy = MRI.getType(Dst);
2346   LLT S16 = LLT::scalar(16);
2347   LLT S32 = LLT::scalar(32);
2348   LLT S64 = LLT::scalar(64);
2349 
2350   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2351     return true;
2352 
2353   if (DstTy == S16)
2354     return legalizeFDIV16(MI, MRI, B);
2355   if (DstTy == S32)
2356     return legalizeFDIV32(MI, MRI, B);
2357   if (DstTy == S64)
2358     return legalizeFDIV64(MI, MRI, B);
2359 
2360   return false;
2361 }
2362 
2363 static Register buildDivRCP(MachineIRBuilder &B, Register Src) {
2364   const LLT S32 = LLT::scalar(32);
2365 
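  // Approximate 2^32 / Src: convert Src to float, take the hardware
  // reciprocal, scale by 2^32 (0x4f800000 is 2^32 as a float), and convert
  // the result back to an unsigned integer.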
2366   auto Cvt0 = B.buildUITOFP(S32, Src);
2367   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Cvt0});
2368   auto FPUIntMaxPlus1 = B.buildFConstant(S32, BitsToFloat(0x4f800000));
2369   auto Mul = B.buildFMul(S32, RcpIFlag, FPUIntMaxPlus1);
2370   return B.buildFPTOUI(S32, Mul).getReg(0);
2371 }
2372 
2373 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2374                                                   Register DstReg,
2375                                                   Register Num,
2376                                                   Register Den,
2377                                                   bool IsRem) const {
2378   const LLT S1 = LLT::scalar(1);
2379   const LLT S32 = LLT::scalar(32);
2380 
2381   // RCP =  URECIP(Den) = 2^32 / Den + e
2382   // e is rounding error.
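  // e.g. for Den = 3, RCP is roughly 0x55555555. The steps below adjust RCP
  // for e, form a quotient estimate with mulhu, and finish with a +/-1
  // correction to the quotient (or a +/-Den correction to the remainder).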
2383   auto RCP = buildDivRCP(B, Den);
2384 
2385   // RCP_LO = mul(RCP, Den)
2386   auto RCP_LO = B.buildMul(S32, RCP, Den);
2387 
  // RCP_HI = mulhu(RCP, Den)
2389   auto RCP_HI = B.buildUMulH(S32, RCP, Den);
2390 
2391   // NEG_RCP_LO = -RCP_LO
2392   auto Zero = B.buildConstant(S32, 0);
2393   auto NEG_RCP_LO = B.buildSub(S32, Zero, RCP_LO);
2394 
2395   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
2396   auto CmpRcpHiZero = B.buildICmp(CmpInst::ICMP_EQ, S1, RCP_HI, Zero);
2397   auto ABS_RCP_LO = B.buildSelect(S32, CmpRcpHiZero, NEG_RCP_LO, RCP_LO);
2398 
2399   // Calculate the rounding error from the URECIP instruction
2400   // E = mulhu(ABS_RCP_LO, RCP)
2401   auto E = B.buildUMulH(S32, ABS_RCP_LO, RCP);
2402 
2403   // RCP_A_E = RCP + E
2404   auto RCP_A_E = B.buildAdd(S32, RCP, E);
2405 
2406   // RCP_S_E = RCP - E
2407   auto RCP_S_E = B.buildSub(S32, RCP, E);
2408 
2409   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
2410   auto Tmp0 = B.buildSelect(S32, CmpRcpHiZero, RCP_A_E, RCP_S_E);
2411 
  // Quotient = mulhu(Tmp0, Num)
2413   auto Quotient = B.buildUMulH(S32, Tmp0, Num);
2414 
2415   // Num_S_Remainder = Quotient * Den
2416   auto Num_S_Remainder = B.buildMul(S32, Quotient, Den);
2417 
2418   // Remainder = Num - Num_S_Remainder
2419   auto Remainder = B.buildSub(S32, Num, Num_S_Remainder);
2420 
2421   // Remainder_GE_Den = Remainder >= Den
2422   auto Remainder_GE_Den = B.buildICmp(CmpInst::ICMP_UGE, S1, Remainder, Den);
2423 
2424   // Remainder_GE_Zero = Num >= Num_S_Remainder;
2425   auto Remainder_GE_Zero = B.buildICmp(CmpInst::ICMP_UGE, S1,
2426                                        Num, Num_S_Remainder);
2427 
2428   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2429   auto Tmp1 = B.buildAnd(S1, Remainder_GE_Den, Remainder_GE_Zero);
2430 
2431   // Calculate Division result:
2432 
2433   // Quotient_A_One = Quotient + 1
2434   auto One = B.buildConstant(S32, 1);
2435   auto Quotient_A_One = B.buildAdd(S32, Quotient, One);
2436 
2437   // Quotient_S_One = Quotient - 1
2438   auto Quotient_S_One = B.buildSub(S32, Quotient, One);
2439 
2440   // Div = (Tmp1 == 0 ? Quotient_A_One : Quotient)
2441   auto Div = B.buildSelect(S32, Tmp1, Quotient, Quotient_A_One);
2442 
2443   // Div = (Remainder_GE_Zero ? Div : Quotient_S_One)
2444   if (IsRem) {
2445     Div = B.buildSelect(S32, Remainder_GE_Zero, Div, Quotient_S_One);
2446 
2447     // Calculate Rem result:
2448     auto Remainder_S_Den = B.buildSub(S32, Remainder, Den);
2449 
2450     // Remainder_A_Den = Remainder + Den
2451     auto Remainder_A_Den = B.buildAdd(S32, Remainder, Den);
2452 
2453     // Rem = (Tmp1 ? Remainder_S_Den : Remainder)
2454     auto Rem = B.buildSelect(S32, Tmp1, Remainder_S_Den, Remainder);
2455 
2456     // Rem = (Remainder_GE_Zero ? Rem : Remainder_A_Den)
2457     B.buildSelect(DstReg, Remainder_GE_Zero, Rem, Remainder_A_Den);
2458   } else {
2459     B.buildSelect(DstReg, Remainder_GE_Zero, Div, Quotient_S_One);
2460   }
2461 }
2462 
2463 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2464                                               MachineRegisterInfo &MRI,
2465                                               MachineIRBuilder &B) const {
2466   B.setInstr(MI);
2467   const bool IsRem = MI.getOpcode() == AMDGPU::G_UREM;
2468   Register DstReg = MI.getOperand(0).getReg();
2469   Register Num = MI.getOperand(1).getReg();
2470   Register Den = MI.getOperand(2).getReg();
2471   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsRem);
2472   MI.eraseFromParent();
2473   return true;
2474 }
2475 
2476 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2477                                             MachineRegisterInfo &MRI,
2478                                             MachineIRBuilder &B) const {
2479   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2480     return legalizeUDIV_UREM32(MI, MRI, B);
2481   return false;
2482 }
2483 
2484 bool AMDGPULegalizerInfo::legalizeSDIV_SREM32(MachineInstr &MI,
2485                                               MachineRegisterInfo &MRI,
2486                                               MachineIRBuilder &B) const {
2487   B.setInstr(MI);
2488   const LLT S32 = LLT::scalar(32);
2489 
2490   const bool IsRem = MI.getOpcode() == AMDGPU::G_SREM;
2491   Register DstReg = MI.getOperand(0).getReg();
2492   Register LHS = MI.getOperand(1).getReg();
2493   Register RHS = MI.getOperand(2).getReg();
2494 
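  // Take absolute values using the identity abs(x) = (x + sign) ^ sign with
  // sign = x >> 31 (all ones for negative x), do the unsigned divide or
  // remainder, then restore the proper sign of the result below.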
2495   auto ThirtyOne = B.buildConstant(S32, 31);
2496   auto LHSign = B.buildAShr(S32, LHS, ThirtyOne);
  auto RHSign = B.buildAShr(S32, RHS, ThirtyOne);
2498 
2499   LHS = B.buildAdd(S32, LHS, LHSign).getReg(0);
2500   RHS = B.buildAdd(S32, RHS, RHSign).getReg(0);
2501 
2502   LHS = B.buildXor(S32, LHS, LHSign).getReg(0);
2503   RHS = B.buildXor(S32, RHS, RHSign).getReg(0);
2504 
2505   Register UDivRem = MRI.createGenericVirtualRegister(S32);
2506   legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsRem);
2507 
2508   if (IsRem) {
2509     auto RSign = LHSign; // Remainder sign is the same as LHS
2510     UDivRem = B.buildXor(S32, UDivRem, RSign).getReg(0);
2511     B.buildSub(DstReg, UDivRem, RSign);
2512   } else {
2513     auto DSign = B.buildXor(S32, LHSign, RHSign);
2514     UDivRem = B.buildXor(S32, UDivRem, DSign).getReg(0);
2515     B.buildSub(DstReg, UDivRem, DSign);
2516   }
2517 
2518   MI.eraseFromParent();
2519   return true;
2520 }
2521 
2522 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2523                                             MachineRegisterInfo &MRI,
2524                                             MachineIRBuilder &B) const {
2525   if (MRI.getType(MI.getOperand(0).getReg()) == LLT::scalar(32))
2526     return legalizeSDIV_SREM32(MI, MRI, B);
2527   return false;
2528 }
2529 
2530 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2531                                                  MachineRegisterInfo &MRI,
2532                                                  MachineIRBuilder &B) const {
2533   Register Res = MI.getOperand(0).getReg();
2534   Register LHS = MI.getOperand(1).getReg();
2535   Register RHS = MI.getOperand(2).getReg();
2536 
2537   uint16_t Flags = MI.getFlags();
2538 
2539   LLT ResTy = MRI.getType(Res);
2540   LLT S32 = LLT::scalar(32);
2541   LLT S64 = LLT::scalar(64);
2542 
2543   const MachineFunction &MF = B.getMF();
2544   bool Unsafe =
2545     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2546 
2547   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2548     return false;
2549 
2550   if (!Unsafe && ResTy == S32 &&
2551       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2552     return false;
2553 
2554   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2555     // 1 / x -> RCP(x)
2556     if (CLHS->isExactlyValue(1.0)) {
2557       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2558         .addUse(RHS)
2559         .setMIFlags(Flags);
2560 
2561       MI.eraseFromParent();
2562       return true;
2563     }
2564 
2565     // -1 / x -> RCP( FNEG(x) )
2566     if (CLHS->isExactlyValue(-1.0)) {
2567       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2568       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2569         .addUse(FNeg.getReg(0))
2570         .setMIFlags(Flags);
2571 
2572       MI.eraseFromParent();
2573       return true;
2574     }
2575   }
2576 
2577   // x / y -> x * (1.0 / y)
2578   if (Unsafe) {
2579     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2580       .addUse(RHS)
2581       .setMIFlags(Flags);
2582     B.buildFMul(Res, LHS, RCP, Flags);
2583 
2584     MI.eraseFromParent();
2585     return true;
2586   }
2587 
2588   return false;
2589 }
2590 
2591 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2592                                          MachineRegisterInfo &MRI,
2593                                          MachineIRBuilder &B) const {
2594   B.setInstr(MI);
2595   Register Res = MI.getOperand(0).getReg();
2596   Register LHS = MI.getOperand(1).getReg();
2597   Register RHS = MI.getOperand(2).getReg();
2598 
2599   uint16_t Flags = MI.getFlags();
2600 
2601   LLT S16 = LLT::scalar(16);
2602   LLT S32 = LLT::scalar(32);
2603 
2604   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2605   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2606 
2607   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2608     .addUse(RHSExt.getReg(0))
2609     .setMIFlags(Flags);
2610 
2611   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2612   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2613 
2614   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2615     .addUse(RDst.getReg(0))
2616     .addUse(RHS)
2617     .addUse(LHS)
2618     .setMIFlags(Flags);
2619 
2620   MI.eraseFromParent();
2621   return true;
2622 }
2623 
2624 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2625 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2626 static void toggleSPDenormMode(bool Enable,
2627                                MachineIRBuilder &B,
2628                                const GCNSubtarget &ST,
2629                                AMDGPU::SIModeRegisterDefaults Mode) {
2630   // Set SP denorm mode to this value.
2631   unsigned SPDenormMode =
2632     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2633 
2634   if (ST.hasDenormModeInst()) {
2635     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2636     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2637 
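    // Pack the new FP32 setting into bits [1:0] and the preserved FP64/FP16
    // setting into bits [3:2] of the S_DENORM_MODE immediate.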
2638     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2639     B.buildInstr(AMDGPU::S_DENORM_MODE)
2640       .addImm(NewDenormModeValue);
2641 
2642   } else {
2643     // Select FP32 bit field in mode register.
2644     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2645                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2646                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2647 
2648     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2649       .addImm(SPDenormMode)
2650       .addImm(SPDenormModeBitField);
2651   }
2652 }
2653 
2654 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2655                                          MachineRegisterInfo &MRI,
2656                                          MachineIRBuilder &B) const {
2657   B.setInstr(MI);
2658   Register Res = MI.getOperand(0).getReg();
2659   Register LHS = MI.getOperand(1).getReg();
2660   Register RHS = MI.getOperand(2).getReg();
2661   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2662   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2663 
2664   uint16_t Flags = MI.getFlags();
2665 
2666   LLT S32 = LLT::scalar(32);
2667   LLT S1 = LLT::scalar(1);
2668 
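  // The expansion scales the numerator and denominator with div_scale,
  // refines an initial rcp estimate of the scaled denominator with FMA
  // correction steps, and lets div_fmas/div_fixup assemble the final result
  // and handle the special cases.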
2669   auto One = B.buildFConstant(S32, 1.0f);
2670 
2671   auto DenominatorScaled =
2672     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2673       .addUse(RHS)
2674       .addUse(LHS)
2675       .addImm(1)
2676       .setMIFlags(Flags);
2677   auto NumeratorScaled =
2678     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2679       .addUse(LHS)
2680       .addUse(RHS)
2681       .addImm(0)
2682       .setMIFlags(Flags);
2683 
2684   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2685     .addUse(DenominatorScaled.getReg(0))
2686     .setMIFlags(Flags);
2687   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2688 
2689   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2690   // aren't modeled as reading it.
2691   if (!Mode.allFP32Denormals())
2692     toggleSPDenormMode(true, B, ST, Mode);
2693 
2694   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2695   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2696   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2697   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2698   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2699   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2700 
2701   if (!Mode.allFP32Denormals())
2702     toggleSPDenormMode(false, B, ST, Mode);
2703 
2704   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2705     .addUse(Fma4.getReg(0))
2706     .addUse(Fma1.getReg(0))
2707     .addUse(Fma3.getReg(0))
2708     .addUse(NumeratorScaled.getReg(1))
2709     .setMIFlags(Flags);
2710 
2711   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2712     .addUse(Fmas.getReg(0))
2713     .addUse(RHS)
2714     .addUse(LHS)
2715     .setMIFlags(Flags);
2716 
2717   MI.eraseFromParent();
2718   return true;
2719 }
2720 
2721 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2722                                          MachineRegisterInfo &MRI,
2723                                          MachineIRBuilder &B) const {
2724   B.setInstr(MI);
2725   Register Res = MI.getOperand(0).getReg();
2726   Register LHS = MI.getOperand(1).getReg();
2727   Register RHS = MI.getOperand(2).getReg();
2728 
2729   uint16_t Flags = MI.getFlags();
2730 
2731   LLT S64 = LLT::scalar(64);
2732   LLT S1 = LLT::scalar(1);
2733 
2734   auto One = B.buildFConstant(S64, 1.0);
2735 
2736   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2737     .addUse(LHS)
2738     .addUse(RHS)
2739     .addImm(1)
2740     .setMIFlags(Flags);
2741 
2742   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2743 
2744   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2745     .addUse(DivScale0.getReg(0))
2746     .setMIFlags(Flags);
2747 
2748   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2749   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2750   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2751 
2752   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2753     .addUse(LHS)
2754     .addUse(RHS)
2755     .addImm(0)
2756     .setMIFlags(Flags);
2757 
2758   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2760   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2761 
2762   Register Scale;
2763   if (!ST.hasUsableDivScaleConditionOutput()) {
2764     // Workaround a hardware bug on SI where the condition output from div_scale
2765     // is not usable.
2766 
2767     LLT S32 = LLT::scalar(32);
2768 
2769     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2770     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2771     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2772     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2773 
2774     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2775                               Scale1Unmerge.getReg(1));
2776     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2777                               Scale0Unmerge.getReg(1));
2778     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2779   } else {
2780     Scale = DivScale1.getReg(1);
2781   }
2782 
2783   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2784     .addUse(Fma4.getReg(0))
2785     .addUse(Fma3.getReg(0))
2786     .addUse(Mul.getReg(0))
2787     .addUse(Scale)
2788     .setMIFlags(Flags);
2789 
2790   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2791     .addUse(Fmas.getReg(0))
2792     .addUse(RHS)
2793     .addUse(LHS)
2794     .setMIFlags(Flags);
2795 
2796   MI.eraseFromParent();
2797   return true;
2798 }
2799 
2800 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2801                                                  MachineRegisterInfo &MRI,
2802                                                  MachineIRBuilder &B) const {
2803   B.setInstr(MI);
2804   Register Res = MI.getOperand(0).getReg();
2805   Register LHS = MI.getOperand(2).getReg();
2806   Register RHS = MI.getOperand(3).getReg();
2807   uint16_t Flags = MI.getFlags();
2808 
2809   LLT S32 = LLT::scalar(32);
2810   LLT S1 = LLT::scalar(1);
2811 
2812   auto Abs = B.buildFAbs(S32, RHS, Flags);
2813   const APFloat C0Val(1.0f);
2814 
2815   auto C0 = B.buildConstant(S32, 0x6f800000);
2816   auto C1 = B.buildConstant(S32, 0x2f800000);
2817   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2818 
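  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. If |RHS|
  // exceeds 2^96, pre-scale it by 2^-32 so the rcp result does not flush to
  // zero, then fold the same scale factor back into the final product.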
2819   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2820   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2821 
2822   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2823 
2824   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2825     .addUse(Mul0.getReg(0))
2826     .setMIFlags(Flags);
2827 
2828   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2829 
2830   B.buildFMul(Res, Sel, Mul1, Flags);
2831 
2832   MI.eraseFromParent();
2833   return true;
2834 }
2835 
2836 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2837                                                  MachineRegisterInfo &MRI,
2838                                                  MachineIRBuilder &B) const {
2839   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2840   if (!MFI->isEntryFunction()) {
2841     return legalizePreloadedArgIntrin(MI, MRI, B,
2842                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2843   }
2844 
2845   B.setInstr(MI);
2846 
2847   uint64_t Offset =
2848     ST.getTargetLowering()->getImplicitParameterOffset(
2849       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2850   Register DstReg = MI.getOperand(0).getReg();
2851   LLT DstTy = MRI.getType(DstReg);
2852   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2853 
2854   const ArgDescriptor *Arg;
2855   const TargetRegisterClass *RC;
2856   std::tie(Arg, RC)
2857     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2858   if (!Arg)
2859     return false;
2860 
2861   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2862   if (!loadInputValue(KernargPtrReg, B, Arg))
2863     return false;
2864 
2865   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2866   MI.eraseFromParent();
2867   return true;
2868 }
2869 
2870 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2871                                               MachineRegisterInfo &MRI,
2872                                               MachineIRBuilder &B,
2873                                               unsigned AddrSpace) const {
2874   B.setInstr(MI);
2875   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2876   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2877   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2878   MI.eraseFromParent();
2879   return true;
2880 }
2881 
2882 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2883 // offset (the offset that is included in bounds checking and swizzling, to be
2884 // split between the instruction's voffset and immoffset fields) and soffset
2885 // (the offset that is excluded from bounds checking and swizzling, to go in
2886 // the instruction's soffset field).  This function takes the first kind of
2887 // offset and figures out how to split it between voffset and immoffset.
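//
// For example, a purely constant offset of 4100 yields an immoffset of 4 and
// a voffset register holding 4096, while a constant offset of 4095 or less
// goes entirely into immoffset with a zero voffset.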
2888 std::tuple<Register, unsigned, unsigned>
2889 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2890                                         Register OrigOffset) const {
2891   const unsigned MaxImm = 4095;
2892   Register BaseReg;
2893   unsigned TotalConstOffset;
2894   MachineInstr *OffsetDef;
2895   const LLT S32 = LLT::scalar(32);
2896 
2897   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2898     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2899 
2900   unsigned ImmOffset = TotalConstOffset;
2901 
2902   // If the immediate value is too big for the immoffset field, put the value
2903   // and -4096 into the immoffset field so that the value that is copied/added
// for the voffset field is a multiple of 4096, and it stands a better chance
2905   // of being CSEd with the copy/add for another similar load/store.
2906   // However, do not do that rounding down to a multiple of 4096 if that is a
2907   // negative number, as it appears to be illegal to have a negative offset
2908   // in the vgpr, even if adding the immediate offset makes it positive.
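  // For example, a constant offset of 5000 becomes an immoffset of 904 with
  // 4096 folded into the voffset, while an offset of 0xfffffff0 (-16) stays
  // entirely in the voffset with an immoffset of 0.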
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}

/// Handle register layout difference for f16 images for some subtargets.
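/// On subtargets with unpacked d16 VMEM each 16-bit element occupies the low
/// half of its own 32-bit register, so <N x s16> store data is any-extended
/// element-wise into an <N x s32> build_vector.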
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

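// Widen illegal 8- and 16-bit scalar store sources to 32 bits, and repack
// d16 vector store data for format stores on unpacked subtargets.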
Register AMDGPULegalizerInfo::fixStoreSourceType(
  MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fix up illegal register types for i8 and i16 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}

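// Lower the raw/struct buffer and tbuffer store intrinsics to the
// corresponding target store pseudos, splitting the offset operand into the
// voffset register and the immediate offset field.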
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  B.setInstr(MI);

  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;
  unsigned TotalOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

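// Lower the raw/struct buffer and tbuffer load intrinsics to the
// corresponding target load pseudos, widening the result for extending and
// unpacked d16 loads and repacking it back afterwards.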
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  B.setInstr(MI);

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result was widened for an extending load; truncate it back down.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

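// Lower amdgcn.atomic.inc/dec to the G_AMDGPU_ATOMIC_INC/DEC pseudos,
// preserving the original memory operands.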
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  B.setInstr(MI);
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

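// Map a raw/struct buffer atomic intrinsic ID to the corresponding
// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode.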
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

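// Lower the raw/struct buffer atomic intrinsics to the target buffer atomic
// pseudos, splitting the offset the same way as the buffer loads and stores.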
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  B.setInstr(MI);

  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
    .addDef(Dst)
    .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

// Produce a vector of s16 elements from s32 pieces.
static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
                             ArrayRef<Register> UnmergeParts) {
  const LLT S16 = LLT::scalar(16);

  SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
  for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
    RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);

  B.buildBuildVector(DstReg, RemergeParts);
}

/// Convert a set of s32 registers to a result vector with s16 elements.
static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
                               ArrayRef<Register> UnmergeParts) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT V2S16 = LLT::vector(2, 16);
  LLT TargetTy = MRI.getType(DstReg);
  int NumElts = UnmergeParts.size();

  if (NumElts == 1) {
    assert(TargetTy == V2S16);
    B.buildBitcast(DstReg, UnmergeParts[0]);
    return;
  }

  SmallVector<Register, 4> RemergeParts(NumElts);
  for (int I = 0; I != NumElts; ++I)
    RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);

  if (TargetTy.getSizeInBits() == 32u * NumElts) {
    B.buildConcatVectors(DstReg, RemergeParts);
    return;
  }

  const LLT V3S16 = LLT::vector(3, 16);
  const LLT V6S16 = LLT::vector(6, 16);

  // Widen to v6s16 and unpack v3 parts.
  assert(TargetTy == V3S16);

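  // Pad with an undef v2s16, concatenate to v6s16, and unmerge into two
  // v3s16 halves; only the low half defines DstReg and the high half is dead.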
  RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
  auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
  B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
}

// FIXME: Just a vector trunc should be sufficient, but legalization is
// currently broken.
static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
                                  Register WideDstReg) {
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  auto Unmerge = B.buildUnmerge(S32, WideDstReg);

  int NumOps = Unmerge->getNumOperands() - 1;
  SmallVector<Register, 4> RemergeParts(NumOps);
  for (int I = 0; I != NumOps; ++I)
    RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);

  B.buildBuildVector(DstReg, RemergeParts);
}

bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B,
    GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
  bool IsTFE = MI.getNumExplicitDefs() == 2;

  // We only need to process the operands of d16 image operations on
  // subtargets that use the unpacked register layout, or to repack the TFE
  // result.

  // TODO: Need to handle a16 images too
  // TODO: Do we need to guard against already legalized intrinsics?
  if (!IsTFE && !ST.hasUnpackedD16VMem())
    return true;

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

  if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
    return true;

  B.setInstr(MI);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  if (BaseOpcode->Store) { // No TFE for stores?
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    B.setInstr(MI);

    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
    Observer.changedInstr(MI);
    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  if (IsTFE) {
    // In the IR, TFE is supposed to be used with a 2 element struct return
    // type. The instruction really returns these two values in one contiguous
    // register, with one additional dword beyond the loaded data. Rewrite the
    // return type to use a single register result.
    Register Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.

    // The raw dword-aligned data component of the load. The only legal cases
    // where this matters are when using the packed D16 format, for
    // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
    LLT RoundedTy;
    LLT TFETy;

    if (IsD16 && ST.hasUnpackedD16VMem()) {
      RoundedTy = LLT::scalarOrVector(NumElts, 32);
      TFETy = LLT::vector(NumElts + 1, 32);
    } else {
      unsigned EltSize = Ty.getScalarSizeInBits();
      unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
      unsigned RoundedSize = 32 * RoundedElts;
      RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
      TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    }

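    // For example, a packed d16 <3 x s16> load rounds up to a <4 x s16> data
    // type and uses a <3 x s32> result register: two dwords of data plus the
    // TFE status dword.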
    Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
    Observer.changingInstr(MI);

    MI.getOperand(0).setReg(TFEReg);
    MI.RemoveOperand(1);

    Observer.changedInstr(MI);

    // Insert after the instruction.
    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    // Now figure out how to copy the new result register back into the old
    // result.

    SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
    int NumDataElts = TFETy.getNumElements() - 1;

    if (!Ty.isVector()) {
      // Simplest case is a trivial unmerge (plus a truncate for d16).
      UnmergeResults[0] = Ty == S32 ?
        DstReg : MRI->createGenericVirtualRegister(S32);

      B.buildUnmerge(UnmergeResults, TFEReg);
      if (Ty != S32)
        B.buildTrunc(DstReg, UnmergeResults[0]);
      return true;
    }

    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataElts; ++I)
      UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
    B.buildUnmerge(UnmergeResults, TFEReg);

    // Drop the final TFE element.
    ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);

    if (EltTy == S32)
      B.buildBuildVector(DstReg, DataPart);
    else if (ST.hasUnpackedD16VMem())
      truncToS16Vector(B, DstReg, DataPart);
    else
      bitcastToS16Vector(B, DstReg, DataPart);

    return true;
  }

  // Must be an image load.
  if (!Ty.isVector() || Ty.getElementType() != S16)
    return true;

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  LLT WidenedTy = Ty.changeElementType(S32);
  Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);

  Observer.changingInstr(MI);
  MI.getOperand(0).setReg(WideDstReg);
  Observer.changedInstr(MI);

  repackUnpackedD16Load(B, DstReg, WideDstReg);
  return true;
}

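// Turn the readnone s.buffer.load intrinsic into the G_AMDGPU_S_BUFFER_LOAD
// pseudo, attach the memory operand the intrinsic lacks, and round
// non-power-of-2 result sizes up to the next power of two.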
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  MachineInstr &MI, MachineIRBuilder &B,
  GISelChangeObserver &Observer) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const unsigned MemAlign = 4;
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant, MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    LegalizerHelper Helper(MF, *this, Observer, B);
    B.setInstr(MI);

    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineIRBuilder &B,
                                            GISelChangeObserver &Observer) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrTarget);

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(MI, B, Observer);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}