1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
// Round the number of vector elements up to the next power of two.
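// For example, a v3s16 would become a v4s16.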
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
// Round the size in bits up to the next power of two, as a scalar type.
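// For example, an s48 (or a v3s16) becomes an s64.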
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
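// A "small odd vector" is e.g. v3s16: an odd number of sub-32-bit elements
// whose total size is not a multiple of 32 bits. v3s32 and v4s16 do not
// qualify.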
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
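// For example, fewerEltsToSize64Vector turns a v4s32 (128 bits) into v2s32
// and a v3s64 (192 bits) into a plain s64.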
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
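// For example, v3s16 (48 bits) is widened to v4s16 (64 bits).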
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
// Any vector of 32, 64, 128 or 256-bit elements, any multiple of v2s16, or
// any scalar that is a multiple of 32 bits, up to 1024 bits.
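// For example, s96, v2s16, and v4s32 are register types, while v3s16 (odd
// count of 16-bit elements) and s24 are not.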
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
182                                          const GCNTargetMachine &TM)
183   :  ST(ST_) {
184   using namespace TargetOpcode;
185 
186   auto GetAddrSpacePtr = [&TM](unsigned AS) {
187     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
188   };
189 
190   const LLT S1 = LLT::scalar(1);
191   const LLT S16 = LLT::scalar(16);
192   const LLT S32 = LLT::scalar(32);
193   const LLT S64 = LLT::scalar(64);
194   const LLT S96 = LLT::scalar(96);
195   const LLT S128 = LLT::scalar(128);
196   const LLT S256 = LLT::scalar(256);
197   const LLT S1024 = LLT::scalar(1024);
198 
199   const LLT V2S16 = LLT::vector(2, 16);
200   const LLT V4S16 = LLT::vector(4, 16);
201 
202   const LLT V2S32 = LLT::vector(2, 32);
203   const LLT V3S32 = LLT::vector(3, 32);
204   const LLT V4S32 = LLT::vector(4, 32);
205   const LLT V5S32 = LLT::vector(5, 32);
206   const LLT V6S32 = LLT::vector(6, 32);
207   const LLT V7S32 = LLT::vector(7, 32);
208   const LLT V8S32 = LLT::vector(8, 32);
209   const LLT V9S32 = LLT::vector(9, 32);
210   const LLT V10S32 = LLT::vector(10, 32);
211   const LLT V11S32 = LLT::vector(11, 32);
212   const LLT V12S32 = LLT::vector(12, 32);
213   const LLT V13S32 = LLT::vector(13, 32);
214   const LLT V14S32 = LLT::vector(14, 32);
215   const LLT V15S32 = LLT::vector(15, 32);
216   const LLT V16S32 = LLT::vector(16, 32);
217   const LLT V32S32 = LLT::vector(32, 32);
218 
219   const LLT V2S64 = LLT::vector(2, 64);
220   const LLT V3S64 = LLT::vector(3, 64);
221   const LLT V4S64 = LLT::vector(4, 64);
222   const LLT V5S64 = LLT::vector(5, 64);
223   const LLT V6S64 = LLT::vector(6, 64);
224   const LLT V7S64 = LLT::vector(7, 64);
225   const LLT V8S64 = LLT::vector(8, 64);
226   const LLT V16S64 = LLT::vector(16, 64);
227 
228   std::initializer_list<LLT> AllS32Vectors =
229     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
230      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
231   std::initializer_list<LLT> AllS64Vectors =
232     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
233 
234   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
235   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
236   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
237   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
238   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
239   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
240   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
241 
242   const LLT CodePtr = FlatPtr;
243 
244   const std::initializer_list<LLT> AddrSpaces64 = {
245     GlobalPtr, ConstantPtr, FlatPtr
246   };
247 
248   const std::initializer_list<LLT> AddrSpaces32 = {
249     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
250   };
251 
252   const std::initializer_list<LLT> FPTypesBase = {
253     S32, S64
254   };
255 
256   const std::initializer_list<LLT> FPTypes16 = {
257     S32, S64, S16
258   };
259 
260   const std::initializer_list<LLT> FPTypesPK16 = {
261     S32, S64, S16, V2S16
262   };
263 
264   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
265 
266   setAction({G_BRCOND, S1}, Legal); // VCC branches
267   setAction({G_BRCOND, S32}, Legal); // SCC branches
268 
269   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
270   // elements for v3s16
271   getActionDefinitionsBuilder(G_PHI)
272     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
273     .legalFor(AllS32Vectors)
274     .legalFor(AllS64Vectors)
275     .legalFor(AddrSpaces64)
276     .legalFor(AddrSpaces32)
277     .clampScalar(0, S32, S256)
278     .widenScalarToNextPow2(0, 32)
279     .clampMaxNumElements(0, S32, 16)
280     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
281     .legalIf(isPointer(0));
282 
283   if (ST.has16BitInsts()) {
284     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
285       .legalFor({S32, S16})
286       .clampScalar(0, S16, S32)
287       .scalarize(0)
288       .widenScalarToNextPow2(0, 32);
289   } else {
290     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
291       .legalFor({S32})
292       .clampScalar(0, S32, S32)
293       .scalarize(0);
294   }
295 
296   // FIXME: Not really legal. Placeholder for custom lowering.
297   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
298     .legalFor({S32, S64})
299     .clampScalar(0, S32, S64)
300     .widenScalarToNextPow2(0, 32)
301     .scalarize(0);
302 
303   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
304     .legalFor({S32})
305     .clampScalar(0, S32, S32)
306     .scalarize(0);
307 
308   // Report legal for any types we can handle anywhere. For the cases only legal
309   // on the SALU, RegBankSelect will be able to re-legalize.
310   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
311     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
312     .clampScalar(0, S32, S64)
313     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
314     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
315     .widenScalarToNextPow2(0)
316     .scalarize(0);
317 
318   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
319                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
320     .legalFor({{S32, S1}, {S32, S32}})
321     .clampScalar(0, S32, S32)
322     .scalarize(0); // TODO: Implement.
323 
324   getActionDefinitionsBuilder(G_BITCAST)
325     // Don't worry about the size constraint.
326     .legalIf(all(isRegisterType(0), isRegisterType(1)))
327     .lower();
328 
329 
330   getActionDefinitionsBuilder(G_CONSTANT)
331     .legalFor({S1, S32, S64, S16, GlobalPtr,
332                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333     .clampScalar(0, S32, S64)
334     .widenScalarToNextPow2(0)
335     .legalIf(isPointer(0));
336 
337   getActionDefinitionsBuilder(G_FCONSTANT)
338     .legalFor({S32, S64, S16})
339     .clampScalar(0, S16, S64);
340 
341   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
342     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
343                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
344     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
345     .clampScalarOrElt(0, S32, S1024)
346     .legalIf(isMultiple32(0))
347     .widenScalarToNextPow2(0, 32)
348     .clampMaxNumElements(0, S32, 16);
349 
350   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
351   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
352     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
353   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
354 
355   auto &FPOpActions = getActionDefinitionsBuilder(
356     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
357     .legalFor({S32, S64});
358   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
359     .customFor({S32, S64});
360   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
361     .customFor({S32, S64});
362 
363   if (ST.has16BitInsts()) {
364     if (ST.hasVOP3PInsts())
365       FPOpActions.legalFor({S16, V2S16});
366     else
367       FPOpActions.legalFor({S16});
368 
369     TrigActions.customFor({S16});
370     FDIVActions.customFor({S16});
371   }
372 
373   auto &MinNumMaxNum = getActionDefinitionsBuilder({
374       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
375 
376   if (ST.hasVOP3PInsts()) {
377     MinNumMaxNum.customFor(FPTypesPK16)
378       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
379       .clampMaxNumElements(0, S16, 2)
380       .clampScalar(0, S16, S64)
381       .scalarize(0);
382   } else if (ST.has16BitInsts()) {
383     MinNumMaxNum.customFor(FPTypes16)
384       .clampScalar(0, S16, S64)
385       .scalarize(0);
386   } else {
387     MinNumMaxNum.customFor(FPTypesBase)
388       .clampScalar(0, S32, S64)
389       .scalarize(0);
390   }
391 
392   if (ST.hasVOP3PInsts())
393     FPOpActions.clampMaxNumElements(0, S16, 2);
394 
395   FPOpActions
396     .scalarize(0)
397     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
398 
399   TrigActions
400     .scalarize(0)
401     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
402 
403   FDIVActions
404     .scalarize(0)
405     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
406 
407   getActionDefinitionsBuilder({G_FNEG, G_FABS})
408     .legalFor(FPTypesPK16)
409     .clampMaxNumElements(0, S16, 2)
410     .scalarize(0)
411     .clampScalar(0, S16, S64);
412 
413   if (ST.has16BitInsts()) {
414     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
415       .legalFor({S32, S64, S16})
416       .scalarize(0)
417       .clampScalar(0, S16, S64);
418   } else {
419     getActionDefinitionsBuilder(G_FSQRT)
420       .legalFor({S32, S64})
421       .scalarize(0)
422       .clampScalar(0, S32, S64);
423 
424     if (ST.hasFractBug()) {
425       getActionDefinitionsBuilder(G_FFLOOR)
426         .customFor({S64})
427         .legalFor({S32, S64})
428         .scalarize(0)
429         .clampScalar(0, S32, S64);
430     } else {
431       getActionDefinitionsBuilder(G_FFLOOR)
432         .legalFor({S32, S64})
433         .scalarize(0)
434         .clampScalar(0, S32, S64);
435     }
436   }
437 
438   getActionDefinitionsBuilder(G_FPTRUNC)
439     .legalFor({{S32, S64}, {S16, S32}})
440     .scalarize(0);
441 
442   getActionDefinitionsBuilder(G_FPEXT)
443     .legalFor({{S64, S32}, {S32, S16}})
444     .lowerFor({{S64, S16}}) // FIXME: Implement
445     .scalarize(0);
446 
447   getActionDefinitionsBuilder(G_FSUB)
448       // Use actual fsub instruction
449       .legalFor({S32})
450       // Must use fadd + fneg
451       .lowerFor({S64, S16, V2S16})
452       .scalarize(0)
453       .clampScalar(0, S32, S64);
454 
455   // Whether this is legal depends on the floating point mode for the function.
456   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
457   if (ST.hasMadF16())
458     FMad.customFor({S32, S16});
459   else
460     FMad.customFor({S32});
461   FMad.scalarize(0)
462       .lower();
463 
464   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
465     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
466                {S32, S1}, {S64, S1}, {S16, S1}})
467     .scalarize(0)
468     .clampScalar(0, S32, S64)
469     .widenScalarToNextPow2(1, 32);
470 
471   // TODO: Split s1->s64 during regbankselect for VALU.
472   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
473     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
474     .lowerFor({{S32, S64}})
475     .lowerIf(typeIs(1, S1))
476     .customFor({{S64, S64}});
477   if (ST.has16BitInsts())
478     IToFP.legalFor({{S16, S16}});
479   IToFP.clampScalar(1, S32, S64)
480        .scalarize(0);
481 
482   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
483     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
484     .customFor({{S64, S64}});
485   if (ST.has16BitInsts())
486     FPToI.legalFor({{S16, S16}});
487   else
488     FPToI.minScalar(1, S32);
489 
490   FPToI.minScalar(0, S32)
491        .scalarize(0)
492        .lower();
493 
494   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
495     .scalarize(0)
496     .lower();
497 
498   if (ST.has16BitInsts()) {
499     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
500       .legalFor({S16, S32, S64})
501       .clampScalar(0, S16, S64)
502       .scalarize(0);
503   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
504     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
505       .legalFor({S32, S64})
506       .clampScalar(0, S32, S64)
507       .scalarize(0);
508   } else {
509     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
510       .legalFor({S32})
511       .customFor({S64})
512       .clampScalar(0, S32, S64)
513       .scalarize(0);
514   }
515 
516   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
517     .scalarize(0)
518     .alwaysLegal();
519 
520   auto &CmpBuilder =
521     getActionDefinitionsBuilder(G_ICMP)
522     // The compare output type differs based on the register bank of the output,
523     // so make both s1 and s32 legal.
524     //
525     // Scalar compares producing output in scc will be promoted to s32, as that
526     // is the allocatable register type that will be needed for the copy from
527     // scc. This will be promoted during RegBankSelect, and we assume something
528     // before that won't try to use s32 result types.
529     //
530     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
531     // bank.
532     .legalForCartesianProduct(
533       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
534     .legalForCartesianProduct(
535       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
536   if (ST.has16BitInsts()) {
537     CmpBuilder.legalFor({{S1, S16}});
538   }
539 
540   CmpBuilder
541     .widenScalarToNextPow2(1)
542     .clampScalar(1, S32, S64)
543     .scalarize(0)
544     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
545 
546   getActionDefinitionsBuilder(G_FCMP)
547     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
548     .widenScalarToNextPow2(1)
549     .clampScalar(1, S32, S64)
550     .scalarize(0);
551 
552   // FIXME: fpow has a selection pattern that should move to custom lowering.
553   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
554   if (ST.has16BitInsts())
555     Exp2Ops.legalFor({S32, S16});
556   else
557     Exp2Ops.legalFor({S32});
558   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
559   Exp2Ops.scalarize(0);
560 
561   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
562   if (ST.has16BitInsts())
563     ExpOps.customFor({{S32}, {S16}});
564   else
565     ExpOps.customFor({S32});
566   ExpOps.clampScalar(0, MinScalarFPTy, S32)
567         .scalarize(0);
568 
569   // The 64-bit versions produce 32-bit results, but only on the SALU.
570   getActionDefinitionsBuilder(G_CTPOP)
571     .legalFor({{S32, S32}, {S32, S64}})
572     .clampScalar(0, S32, S32)
573     .clampScalar(1, S32, S64)
574     .scalarize(0)
575     .widenScalarToNextPow2(0, 32)
576     .widenScalarToNextPow2(1, 32);
577 
578   // The hardware instructions return a different result on 0 than the generic
579   // instructions expect. The hardware produces -1, but these produce the
580   // bitwidth.
581   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
582     .scalarize(0)
583     .clampScalar(0, S32, S32)
584     .clampScalar(1, S32, S64)
585     .widenScalarToNextPow2(0, 32)
586     .widenScalarToNextPow2(1, 32)
587     .lower();
588 
589   // The 64-bit versions produce 32-bit results, but only on the SALU.
590   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
591     .legalFor({{S32, S32}, {S32, S64}})
592     .clampScalar(0, S32, S32)
593     .clampScalar(1, S32, S64)
594     .scalarize(0)
595     .widenScalarToNextPow2(0, 32)
596     .widenScalarToNextPow2(1, 32);
597 
598   // TODO: Expand for > s32
599   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
600     .legalFor({S32})
601     .clampScalar(0, S32, S32)
602     .scalarize(0);
603 
604   if (ST.has16BitInsts()) {
605     if (ST.hasVOP3PInsts()) {
606       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
607         .legalFor({S32, S16, V2S16})
608         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
609         .clampMaxNumElements(0, S16, 2)
610         .clampScalar(0, S16, S32)
611         .widenScalarToNextPow2(0)
612         .scalarize(0);
613     } else {
614       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
615         .legalFor({S32, S16})
616         .widenScalarToNextPow2(0)
617         .clampScalar(0, S16, S32)
618         .scalarize(0);
619     }
620   } else {
621     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
622       .legalFor({S32})
623       .clampScalar(0, S32, S32)
624       .widenScalarToNextPow2(0)
625       .scalarize(0);
626   }
627 
628   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
629     return [=](const LegalityQuery &Query) {
630       return Query.Types[TypeIdx0].getSizeInBits() <
631              Query.Types[TypeIdx1].getSizeInBits();
632     };
633   };
634 
635   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
636     return [=](const LegalityQuery &Query) {
637       return Query.Types[TypeIdx0].getSizeInBits() >
638              Query.Types[TypeIdx1].getSizeInBits();
639     };
640   };
641 
642   getActionDefinitionsBuilder(G_INTTOPTR)
643     // List the common cases
644     .legalForCartesianProduct(AddrSpaces64, {S64})
645     .legalForCartesianProduct(AddrSpaces32, {S32})
646     .scalarize(0)
647     // Accept any address space as long as the size matches
648     .legalIf(sameSize(0, 1))
649     .widenScalarIf(smallerThan(1, 0),
650       [](const LegalityQuery &Query) {
651         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
652       })
653     .narrowScalarIf(greaterThan(1, 0),
654       [](const LegalityQuery &Query) {
655         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
656       });
657 
658   getActionDefinitionsBuilder(G_PTRTOINT)
659     // List the common cases
660     .legalForCartesianProduct(AddrSpaces64, {S64})
661     .legalForCartesianProduct(AddrSpaces32, {S32})
662     .scalarize(0)
663     // Accept any address space as long as the size matches
664     .legalIf(sameSize(0, 1))
665     .widenScalarIf(smallerThan(0, 1),
666       [](const LegalityQuery &Query) {
667         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
668       })
669     .narrowScalarIf(
670       greaterThan(0, 1),
671       [](const LegalityQuery &Query) {
672         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
673       });
674 
675   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
676     .scalarize(0)
677     .custom();
678 
679   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
680   // handle some operations by just promoting the register during
681   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
682   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
683     switch (AS) {
684     // FIXME: Private element size.
685     case AMDGPUAS::PRIVATE_ADDRESS:
686       return 32;
687     // FIXME: Check subtarget
688     case AMDGPUAS::LOCAL_ADDRESS:
689       return ST.useDS128() ? 128 : 64;
690 
691     // Treat constant and global as identical. SMRD loads are sometimes usable
692     // for global loads (ideally constant address space should be eliminated)
693     // depending on the context. Legality cannot be context dependent, but
694     // RegBankSelect can split the load as necessary depending on the pointer
695     // register bank/uniformity and if the memory is invariant or not written in
696     // a kernel.
697     case AMDGPUAS::CONSTANT_ADDRESS:
698     case AMDGPUAS::GLOBAL_ADDRESS:
699       return IsLoad ? 512 : 128;
700     default:
701       return 128;
702     }
703   };
704 
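  // For example (given maxSizeForAddrSpace above), a 256-bit load from the
  // private address space exceeds its 32-bit limit and must be split, as does
  // a 96-bit (3 dword) access on subtargets without dwordx3 load/stores.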
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
706     const LLT DstTy = Query.Types[0];
707 
708     // Split vector extloads.
709     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
710     unsigned Align = Query.MMODescrs[0].AlignInBits;
711 
712     if (MemSize < DstTy.getSizeInBits())
713       MemSize = std::max(MemSize, Align);
714 
715     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
716       return true;
717 
718     const LLT PtrTy = Query.Types[1];
719     unsigned AS = PtrTy.getAddressSpace();
720     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
721       return true;
722 
723     // Catch weird sized loads that don't evenly divide into the access sizes
724     // TODO: May be able to widen depending on alignment etc.
725     unsigned NumRegs = MemSize / 32;
726     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
727       return true;
728 
729     if (Align < MemSize) {
730       const SITargetLowering *TLI = ST.getTargetLowering();
731       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
732     }
733 
734     return false;
735   };
736 
737   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
738   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
739   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
740 
741   // TODO: Refine based on subtargets which support unaligned access or 128-bit
742   // LDS
743   // TODO: Unsupported flat for SI.
744 
745   for (unsigned Op : {G_LOAD, G_STORE}) {
746     const bool IsStore = Op == G_STORE;
747 
748     auto &Actions = getActionDefinitionsBuilder(Op);
749     // Whitelist the common cases.
750     // TODO: Pointer loads
751     // TODO: Wide constant loads
752     // TODO: Only CI+ has 3x loads
753     // TODO: Loads to s16 on gfx9
754     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
755                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
756                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
757                                       {S96, GlobalPtr, 96, GlobalAlign32},
758                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
759                                       {S128, GlobalPtr, 128, GlobalAlign32},
760                                       {S64, GlobalPtr, 64, GlobalAlign32},
761                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
762                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
763                                       {S32, GlobalPtr, 8, GlobalAlign8},
764                                       {S32, GlobalPtr, 16, GlobalAlign16},
765 
766                                       {S32, LocalPtr, 32, 32},
767                                       {S64, LocalPtr, 64, 32},
768                                       {V2S32, LocalPtr, 64, 32},
769                                       {S32, LocalPtr, 8, 8},
770                                       {S32, LocalPtr, 16, 16},
771                                       {V2S16, LocalPtr, 32, 32},
772 
773                                       {S32, PrivatePtr, 32, 32},
774                                       {S32, PrivatePtr, 8, 8},
775                                       {S32, PrivatePtr, 16, 16},
776                                       {V2S16, PrivatePtr, 32, 32},
777 
778                                       {S32, FlatPtr, 32, GlobalAlign32},
779                                       {S32, FlatPtr, 16, GlobalAlign16},
780                                       {S32, FlatPtr, 8, GlobalAlign8},
781                                       {V2S16, FlatPtr, 32, GlobalAlign32},
782 
783                                       {S32, ConstantPtr, 32, GlobalAlign32},
784                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
785                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
786                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
787                                       {S64, ConstantPtr, 64, GlobalAlign32},
788                                       {S128, ConstantPtr, 128, GlobalAlign32},
789                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
790     Actions
791         .customIf(typeIs(1, Constant32Ptr))
792         .narrowScalarIf(
793             [=](const LegalityQuery &Query) -> bool {
794               return !Query.Types[0].isVector() &&
795                      needToSplitMemOp(Query, Op == G_LOAD);
796             },
797             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
798               const LLT DstTy = Query.Types[0];
799               const LLT PtrTy = Query.Types[1];
800 
801               const unsigned DstSize = DstTy.getSizeInBits();
802               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
803 
804               // Split extloads.
805               if (DstSize > MemSize)
806                 return std::make_pair(0, LLT::scalar(MemSize));
807 
808               if (DstSize > 32 && (DstSize % 32 != 0)) {
809                 // FIXME: Need a way to specify non-extload of larger size if
810                 // suitably aligned.
811                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
812               }
813 
814               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
815                                                      Op == G_LOAD);
816               if (MemSize > MaxSize)
817                 return std::make_pair(0, LLT::scalar(MaxSize));
818 
819               unsigned Align = Query.MMODescrs[0].AlignInBits;
820               return std::make_pair(0, LLT::scalar(Align));
821             })
822         .fewerElementsIf(
823             [=](const LegalityQuery &Query) -> bool {
824               return Query.Types[0].isVector() &&
825                      needToSplitMemOp(Query, Op == G_LOAD);
826             },
827             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
828               const LLT DstTy = Query.Types[0];
829               const LLT PtrTy = Query.Types[1];
830 
831               LLT EltTy = DstTy.getElementType();
832               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
833                                                      Op == G_LOAD);
834 
835               // Split if it's too large for the address space.
836               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
837                 unsigned NumElts = DstTy.getNumElements();
838                 unsigned EltSize = EltTy.getSizeInBits();
839 
840                 if (MaxSize % EltSize == 0) {
841                   return std::make_pair(
842                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
843                 }
844 
845                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
846 
847                 // FIXME: Refine when odd breakdowns handled
848                 // The scalars will need to be re-legalized.
849                 if (NumPieces == 1 || NumPieces >= NumElts ||
850                     NumElts % NumPieces != 0)
851                   return std::make_pair(0, EltTy);
852 
853                 return std::make_pair(0,
854                                       LLT::vector(NumElts / NumPieces, EltTy));
855               }
856 
857               // Need to split because of alignment.
858               unsigned Align = Query.MMODescrs[0].AlignInBits;
859               unsigned EltSize = EltTy.getSizeInBits();
860               if (EltSize > Align &&
861                   (EltSize / Align < DstTy.getNumElements())) {
862                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
863               }
864 
865               // May need relegalization for the scalars.
866               return std::make_pair(0, EltTy);
867             })
868         .minScalar(0, S32);
869 
870     if (IsStore)
871       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
872 
873     // TODO: Need a bitcast lower option?
874     Actions
875         .legalIf([=](const LegalityQuery &Query) {
876           const LLT Ty0 = Query.Types[0];
877           unsigned Size = Ty0.getSizeInBits();
878           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
879           unsigned Align = Query.MMODescrs[0].AlignInBits;
880 
881           // FIXME: Widening store from alignment not valid.
882           if (MemSize < Size)
883             MemSize = std::max(MemSize, Align);
884 
885           // No extending vector loads.
886           if (Size > MemSize && Ty0.isVector())
887             return false;
888 
889           switch (MemSize) {
890           case 8:
891           case 16:
892             return Size == 32;
893           case 32:
894           case 64:
895           case 128:
896             return true;
897           case 96:
898             return ST.hasDwordx3LoadStores();
899           case 256:
900           case 512:
901             return true;
902           default:
903             return false;
904           }
905         })
906         .widenScalarToNextPow2(0)
907         // TODO: v3s32->v4s32 with alignment
908         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
909   }
910 
911   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
912                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
913                                                   {S32, GlobalPtr, 16, 2 * 8},
914                                                   {S32, LocalPtr, 8, 8},
915                                                   {S32, LocalPtr, 16, 16},
916                                                   {S32, PrivatePtr, 8, 8},
917                                                   {S32, PrivatePtr, 16, 16},
918                                                   {S32, ConstantPtr, 8, 8},
919                                                   {S32, ConstantPtr, 16, 2 * 8}});
920   if (ST.hasFlatAddressSpace()) {
921     ExtLoads.legalForTypesWithMemDesc(
922         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
923   }
924 
925   ExtLoads.clampScalar(0, S32, S32)
926           .widenScalarToNextPow2(0)
927           .unsupportedIfMemSizeNotPow2()
928           .lower();
929 
930   auto &Atomics = getActionDefinitionsBuilder(
931     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
932      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
933      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
934      G_ATOMICRMW_UMIN})
935     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
936                {S64, GlobalPtr}, {S64, LocalPtr}});
937   if (ST.hasFlatAddressSpace()) {
938     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
939   }
940 
941   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
942     .legalFor({{S32, LocalPtr}});
943 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
946   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
947     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
948                 {S32, FlatPtr}, {S64, FlatPtr}})
949     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
950                {S32, RegionPtr}, {S64, RegionPtr}});
951   // TODO: Pointer types, any 32-bit or 64-bit vector
952 
953   // Condition should be s32 for scalar, s1 for vector.
954   getActionDefinitionsBuilder(G_SELECT)
955     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
956           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
957           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
958     .clampScalar(0, S16, S64)
959     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
960     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
961     .scalarize(1)
962     .clampMaxNumElements(0, S32, 2)
963     .clampMaxNumElements(0, LocalPtr, 2)
964     .clampMaxNumElements(0, PrivatePtr, 2)
965     .scalarize(0)
966     .widenScalarToNextPow2(0)
967     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
968 
969   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
970   // be more flexible with the shift amount type.
971   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
972     .legalFor({{S32, S32}, {S64, S32}});
973   if (ST.has16BitInsts()) {
974     if (ST.hasVOP3PInsts()) {
975       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
976             .clampMaxNumElements(0, S16, 2);
977     } else
978       Shifts.legalFor({{S16, S32}, {S16, S16}});
979 
980     // TODO: Support 16-bit shift amounts
981     Shifts.clampScalar(1, S32, S32);
982     Shifts.clampScalar(0, S16, S64);
983     Shifts.widenScalarToNextPow2(0, 16);
984   } else {
985     // Make sure we legalize the shift amount type first, as the general
986     // expansion for the shifted type will produce much worse code if it hasn't
987     // been truncated already.
988     Shifts.clampScalar(1, S32, S32);
989     Shifts.clampScalar(0, S32, S64);
990     Shifts.widenScalarToNextPow2(0, 32);
991   }
992   Shifts.scalarize(0);
993 
994   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
995     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
996     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
997     unsigned IdxTypeIdx = 2;
998 
999     getActionDefinitionsBuilder(Op)
1000       .customIf([=](const LegalityQuery &Query) {
1001           const LLT EltTy = Query.Types[EltTypeIdx];
1002           const LLT VecTy = Query.Types[VecTypeIdx];
1003           const LLT IdxTy = Query.Types[IdxTypeIdx];
1004           return (EltTy.getSizeInBits() == 16 ||
1005                   EltTy.getSizeInBits() % 32 == 0) &&
1006                  VecTy.getSizeInBits() % 32 == 0 &&
1007                  VecTy.getSizeInBits() <= 1024 &&
1008                  IdxTy.getSizeInBits() == 32;
1009         })
1010       .clampScalar(EltTypeIdx, S32, S64)
1011       .clampScalar(VecTypeIdx, S32, S64)
1012       .clampScalar(IdxTypeIdx, S32, S32);
1013   }
1014 
1015   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1016     .unsupportedIf([=](const LegalityQuery &Query) {
1017         const LLT &EltTy = Query.Types[1].getElementType();
1018         return Query.Types[0] != EltTy;
1019       });
1020 
1021   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1022     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1023     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1024 
1025     // FIXME: Doesn't handle extract of illegal sizes.
1026     getActionDefinitionsBuilder(Op)
1027       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1028       // FIXME: Multiples of 16 should not be legal.
1029       .legalIf([=](const LegalityQuery &Query) {
1030           const LLT BigTy = Query.Types[BigTyIdx];
1031           const LLT LitTy = Query.Types[LitTyIdx];
1032           return (BigTy.getSizeInBits() % 32 == 0) &&
1033                  (LitTy.getSizeInBits() % 16 == 0);
1034         })
1035       .widenScalarIf(
1036         [=](const LegalityQuery &Query) {
1037           const LLT BigTy = Query.Types[BigTyIdx];
1038           return (BigTy.getScalarSizeInBits() < 16);
1039         },
1040         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1041       .widenScalarIf(
1042         [=](const LegalityQuery &Query) {
1043           const LLT LitTy = Query.Types[LitTyIdx];
1044           return (LitTy.getScalarSizeInBits() < 16);
1045         },
1046         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1047       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1048       .widenScalarToNextPow2(BigTyIdx, 32);
1049 
1050   }
1051 
1052   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1053     .legalForCartesianProduct(AllS32Vectors, {S32})
1054     .legalForCartesianProduct(AllS64Vectors, {S64})
1055     .clampNumElements(0, V16S32, V32S32)
1056     .clampNumElements(0, V2S64, V16S64)
1057     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1058 
1059   if (ST.hasScalarPackInsts()) {
1060     BuildVector
1061       // FIXME: Should probably widen s1 vectors straight to s32
1062       .minScalarOrElt(0, S16)
1063       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1064       .minScalar(1, S32);
1065 
1066     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1067       .legalFor({V2S16, S32})
1068       .lower();
1069     BuildVector.minScalarOrElt(0, S32);
1070   } else {
1071     BuildVector.customFor({V2S16, S16});
1072     BuildVector.minScalarOrElt(0, S32);
1073 
1074     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1075       .customFor({V2S16, S32})
1076       .lower();
1077   }
1078 
1079   BuildVector.legalIf(isRegisterType(0));
1080 
1081   // FIXME: Clamp maximum size
1082   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1083     .legalIf(isRegisterType(0));
1084 
  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1087   if (ST.hasVOP3PInsts()) {
1088     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1089       .customFor({V2S16, V2S16})
1090       .lower();
1091   } else
1092     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1093 
1094   // Merge/Unmerge
1095   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1096     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1097     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1098 
1099     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1100       const LLT &Ty = Query.Types[TypeIdx];
1101       if (Ty.isVector()) {
1102         const LLT &EltTy = Ty.getElementType();
1103         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1104           return true;
1105         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1106           return true;
1107       }
1108       return false;
1109     };
1110 
1111     auto &Builder = getActionDefinitionsBuilder(Op)
1112       // Try to widen to s16 first for small types.
1113       // TODO: Only do this on targets with legal s16 shifts
1114       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1115 
1116       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1117       .lowerFor({{S16, V2S16}})
1118       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1119       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1120                            elementTypeIs(1, S16)),
1121                        changeTo(1, V2S16))
1122       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1123       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1124       // valid.
1125       .clampScalar(LitTyIdx, S32, S256)
1126       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1127       // Break up vectors with weird elements into scalars
1128       .fewerElementsIf(
1129         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1130         scalarize(0))
1131       .fewerElementsIf(
1132         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1133         scalarize(1))
1134       .clampScalar(BigTyIdx, S32, S1024);
1135 
1136     if (Op == G_MERGE_VALUES) {
1137       Builder.widenScalarIf(
1138         // TODO: Use 16-bit shifts if legal for 8-bit values?
1139         [=](const LegalityQuery &Query) {
1140           const LLT Ty = Query.Types[LitTyIdx];
1141           return Ty.getSizeInBits() < 32;
1142         },
1143         changeTo(LitTyIdx, S32));
1144     }
1145 
1146     Builder.widenScalarIf(
1147       [=](const LegalityQuery &Query) {
1148         const LLT Ty = Query.Types[BigTyIdx];
1149         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1150           Ty.getSizeInBits() % 16 != 0;
1151       },
1152       [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 above 128, whichever
        // is smaller.
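        // For example, an s65 widens to s128, while an s300 widens to s320
        // (alignTo<64>(301)) rather than all the way to s512.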
1155         const LLT &Ty = Query.Types[BigTyIdx];
1156         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1157         if (NewSizeInBits >= 256) {
1158           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1159           if (RoundedTo < NewSizeInBits)
1160             NewSizeInBits = RoundedTo;
1161         }
1162         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1163       })
1164       .legalIf([=](const LegalityQuery &Query) {
1165           const LLT &BigTy = Query.Types[BigTyIdx];
1166           const LLT &LitTy = Query.Types[LitTyIdx];
1167 
1168           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1169             return false;
1170           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1171             return false;
1172 
1173           return BigTy.getSizeInBits() % 16 == 0 &&
1174                  LitTy.getSizeInBits() % 16 == 0 &&
1175                  BigTy.getSizeInBits() <= 1024;
1176         })
1177       // Any vectors left are the wrong size. Scalarize them.
1178       .scalarize(0)
1179       .scalarize(1);
1180   }
1181 
1182   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1183   // RegBankSelect.
1184   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1185     .legalFor({{S32}, {S64}});
1186 
1187   if (ST.hasVOP3PInsts()) {
1188     SextInReg.lowerFor({{V2S16}})
1189       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1190       // get more vector shift opportunities, since we'll get those when
1191       // expanded.
1192       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1193   } else if (ST.has16BitInsts()) {
1194     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1195   } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
1198     SextInReg.lowerFor({{S32}, {S64}});
1199   }
1200 
1201   SextInReg
1202     .scalarize(0)
1203     .clampScalar(0, S32, S64)
1204     .lower();
1205 
1206   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1207     .legalFor({S64});
1208 
1209   getActionDefinitionsBuilder({
1210       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1211       G_FCOPYSIGN,
1212 
1213       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1214       G_READ_REGISTER,
1215       G_WRITE_REGISTER,
1216 
1217       G_SADDO, G_SSUBO,
1218 
1219        // TODO: Implement
1220       G_FMINIMUM, G_FMAXIMUM
1221     }).lower();
1222 
1223   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1224         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1225         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1226     .unsupported();
1227 
1228   computeTables();
1229   verify(*ST.getInstrInfo());
1230 }
1231 
1232 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1233                                          MachineRegisterInfo &MRI,
1234                                          MachineIRBuilder &B,
1235                                          GISelChangeObserver &Observer) const {
1236   switch (MI.getOpcode()) {
1237   case TargetOpcode::G_ADDRSPACE_CAST:
1238     return legalizeAddrSpaceCast(MI, MRI, B);
1239   case TargetOpcode::G_FRINT:
1240     return legalizeFrint(MI, MRI, B);
1241   case TargetOpcode::G_FCEIL:
1242     return legalizeFceil(MI, MRI, B);
1243   case TargetOpcode::G_INTRINSIC_TRUNC:
1244     return legalizeIntrinsicTrunc(MI, MRI, B);
1245   case TargetOpcode::G_SITOFP:
1246     return legalizeITOFP(MI, MRI, B, true);
1247   case TargetOpcode::G_UITOFP:
1248     return legalizeITOFP(MI, MRI, B, false);
1249   case TargetOpcode::G_FPTOSI:
1250     return legalizeFPTOI(MI, MRI, B, true);
1251   case TargetOpcode::G_FPTOUI:
1252     return legalizeFPTOI(MI, MRI, B, false);
1253   case TargetOpcode::G_FMINNUM:
1254   case TargetOpcode::G_FMAXNUM:
1255   case TargetOpcode::G_FMINNUM_IEEE:
1256   case TargetOpcode::G_FMAXNUM_IEEE:
1257     return legalizeMinNumMaxNum(MI, MRI, B);
1258   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1259     return legalizeExtractVectorElt(MI, MRI, B);
1260   case TargetOpcode::G_INSERT_VECTOR_ELT:
1261     return legalizeInsertVectorElt(MI, MRI, B);
1262   case TargetOpcode::G_SHUFFLE_VECTOR:
1263     return legalizeShuffleVector(MI, MRI, B);
1264   case TargetOpcode::G_FSIN:
1265   case TargetOpcode::G_FCOS:
1266     return legalizeSinCos(MI, MRI, B);
1267   case TargetOpcode::G_GLOBAL_VALUE:
1268     return legalizeGlobalValue(MI, MRI, B);
1269   case TargetOpcode::G_LOAD:
1270     return legalizeLoad(MI, MRI, B, Observer);
1271   case TargetOpcode::G_FMAD:
1272     return legalizeFMad(MI, MRI, B);
1273   case TargetOpcode::G_FDIV:
1274     return legalizeFDIV(MI, MRI, B);
1275   case TargetOpcode::G_ATOMIC_CMPXCHG:
1276     return legalizeAtomicCmpXChg(MI, MRI, B);
1277   case TargetOpcode::G_FLOG:
1278     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1279   case TargetOpcode::G_FLOG10:
1280     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1281   case TargetOpcode::G_FEXP:
1282     return legalizeFExp(MI, B);
1283   case TargetOpcode::G_FFLOOR:
1284     return legalizeFFloor(MI, MRI, B);
1285   case TargetOpcode::G_BUILD_VECTOR:
1286     return legalizeBuildVector(MI, MRI, B);
1287   default:
1288     return false;
1289   }
1290 
1291   llvm_unreachable("expected switch to return");
1292 }
1293 
1294 Register AMDGPULegalizerInfo::getSegmentAperture(
1295   unsigned AS,
1296   MachineRegisterInfo &MRI,
1297   MachineIRBuilder &B) const {
1298   MachineFunction &MF = B.getMF();
1299   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1300   const LLT S32 = LLT::scalar(32);
1301 
1302   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1303 
1304   if (ST.hasApertureRegs()) {
1305     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1306     // getreg.
1307     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1308         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1309         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1310     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1311         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1312         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1313     unsigned Encoding =
1314         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1315         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1316         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1317 
1318     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1319 
1320     B.buildInstr(AMDGPU::S_GETREG_B32)
1321       .addDef(GetReg)
1322       .addImm(Encoding);
1323     MRI.setType(GetReg, S32);
1324 
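    // The field read above holds the upper bits of the 32-bit aperture value,
    // so shift it left by the field width (WidthM1 + 1 == 16) to recover the
    // value used as the high half of a flat address.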
1325     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1326     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1327   }
1328 
1329   Register QueuePtr = MRI.createGenericVirtualRegister(
1330     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1331 
1332   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1333   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1334     return Register();
1335 
1336   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1337   // private_segment_aperture_base_hi.
1338   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1339 
1340   // TODO: can we be smarter about machine pointer info?
1341   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1342   MachineMemOperand *MMO = MF.getMachineMemOperand(
1343     PtrInfo,
1344     MachineMemOperand::MOLoad |
1345     MachineMemOperand::MODereferenceable |
1346     MachineMemOperand::MOInvariant,
1347     4,
1348     MinAlign(64, StructOffset));
1349 
1350   Register LoadAddr;
1351 
1352   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1353   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1354 }
1355 
1356 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1357   MachineInstr &MI, MachineRegisterInfo &MRI,
1358   MachineIRBuilder &B) const {
1359   MachineFunction &MF = B.getMF();
1360 
1361   B.setInstr(MI);
1362 
1363   const LLT S32 = LLT::scalar(32);
1364   Register Dst = MI.getOperand(0).getReg();
1365   Register Src = MI.getOperand(1).getReg();
1366 
1367   LLT DstTy = MRI.getType(Dst);
1368   LLT SrcTy = MRI.getType(Src);
1369   unsigned DestAS = DstTy.getAddressSpace();
1370   unsigned SrcAS = SrcTy.getAddressSpace();
1371 
1372   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1373   // vector element.
1374   assert(!DstTy.isVector());
1375 
1376   const AMDGPUTargetMachine &TM
1377     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1378 
1379   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1380   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1381     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1382     return true;
1383   }
1384 
1385   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1386     // Truncate.
1387     B.buildExtract(Dst, Src, 0);
1388     MI.eraseFromParent();
1389     return true;
1390   }
1391 
1392   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1393     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1394     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1395 
    // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
    // another pointer. Merge operands are required to be the same type, but
    // creating an extra ptrtoint would be kind of pointless.
1399     auto HighAddr = B.buildConstant(
1400       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1401     B.buildMerge(Dst, {Src, HighAddr});
1402     MI.eraseFromParent();
1403     return true;
1404   }
1405 
1406   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1407     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1408            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1409     unsigned NullVal = TM.getNullPointerValue(DestAS);
1410 
1411     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1412     auto FlatNull = B.buildConstant(SrcTy, 0);
1413 
1414     // Extract low 32-bits of the pointer.
1415     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1416 
1417     auto CmpRes =
1418         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1419     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1420 
1421     MI.eraseFromParent();
1422     return true;
1423   }
1424 
1425   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1426     return false;
1427 
1428   if (!ST.hasFlatAddressSpace())
1429     return false;
1430 
1431   auto SegmentNull =
1432       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1433   auto FlatNull =
1434       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1435 
1436   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1437   if (!ApertureReg.isValid())
1438     return false;
1439 
1440   auto CmpRes =
1441       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1442 
1443   // Coerce the type of the low half of the result so we can use merge_values.
1444   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1445 
1446   // TODO: Should we allow mismatched types but matching sizes in merges to
1447   // avoid the ptrtoint?
1448   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1449   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1450 
1451   MI.eraseFromParent();
1452   return true;
1453 }
1454 
1455 bool AMDGPULegalizerInfo::legalizeFrint(
1456   MachineInstr &MI, MachineRegisterInfo &MRI,
1457   MachineIRBuilder &B) const {
1458   B.setInstr(MI);
1459 
1460   Register Src = MI.getOperand(1).getReg();
1461   LLT Ty = MRI.getType(Src);
1462   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1463 
1464   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1465   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
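  // C1 is 2^52 and C2 is the largest double below 2^52. Adding and then
  // subtracting copysign(2^52, src) rounds to the nearest integer (ties to
  // even), since doubles of that magnitude have no fractional bits; the
  // compare/select at the end keeps the original value when |src| is already
  // that large (descriptive summary of the sequence below).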
1466 
1467   auto C1 = B.buildFConstant(Ty, C1Val);
1468   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1469 
1470   // TODO: Should this propagate fast-math-flags?
1471   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1472   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1473 
1474   auto C2 = B.buildFConstant(Ty, C2Val);
1475   auto Fabs = B.buildFAbs(Ty, Src);
1476 
1477   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
1480 }
1481 
1482 bool AMDGPULegalizerInfo::legalizeFceil(
1483   MachineInstr &MI, MachineRegisterInfo &MRI,
1484   MachineIRBuilder &B) const {
1485   B.setInstr(MI);
1486 
1487   const LLT S1 = LLT::scalar(1);
1488   const LLT S64 = LLT::scalar(64);
1489 
1490   Register Src = MI.getOperand(1).getReg();
1491   assert(MRI.getType(Src) == S64);
1492 
1493   // result = trunc(src)
1494   // if (src > 0.0 && src != result)
1495   //   result += 1.0
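  //
  // Worked example (illustrative): ceil(2.25) -> trunc = 2.0, and since
  // 2.25 > 0 and 2.25 != 2.0 the result is 3.0; ceil(-2.25) -> trunc = -2.0,
  // the source is not positive, so the result stays -2.0.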
1496 
1497   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1498 
1499   const auto Zero = B.buildFConstant(S64, 0.0);
1500   const auto One = B.buildFConstant(S64, 1.0);
1501   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1502   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1503   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1504   auto Add = B.buildSelect(S64, And, One, Zero);
1505 
1506   // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
1509 }
1510 
1511 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1512                                               MachineIRBuilder &B) {
1513   const unsigned FractBits = 52;
1514   const unsigned ExpBits = 11;
1515   LLT S32 = LLT::scalar(32);
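  // The biased exponent of an IEEE-754 double occupies bits [62:52], i.e. bits
  // [30:20] of the high 32-bit word, so ubfe(Hi, 52 - 32, 11) extracts it and
  // subtracting the bias (1023) yields the unbiased exponent. For example
  // (illustrative), 1.0 = 0x3FF0000000000000 gives 0x3FF - 1023 = 0.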
1516 
1517   auto Const0 = B.buildConstant(S32, FractBits - 32);
1518   auto Const1 = B.buildConstant(S32, ExpBits);
1519 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1523 
1524   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1525 }
1526 
1527 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1528   MachineInstr &MI, MachineRegisterInfo &MRI,
1529   MachineIRBuilder &B) const {
1530   B.setInstr(MI);
1531 
1532   const LLT S1 = LLT::scalar(1);
1533   const LLT S32 = LLT::scalar(32);
1534   const LLT S64 = LLT::scalar(64);
1535 
1536   Register Src = MI.getOperand(1).getReg();
1537   assert(MRI.getType(Src) == S64);
1538 
1539   // TODO: Should this use extract since the low half is unused?
1540   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1541   Register Hi = Unmerge.getReg(1);
1542 
1543   // Extract the upper half, since this is where we will find the sign and
1544   // exponent.
1545   auto Exp = extractF64Exponent(Hi, B);
1546 
1547   const unsigned FractBits = 52;
1548 
1549   // Extract the sign bit.
1550   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1551   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1552 
1553   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1554 
1555   const auto Zero32 = B.buildConstant(S32, 0);
1556 
1557   // Extend back to 64-bits.
1558   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1559 
1560   auto Shr = B.buildAShr(S64, FractMask, Exp);
1561   auto Not = B.buildNot(S64, Shr);
1562   auto Tmp0 = B.buildAnd(S64, Src, Not);
1563   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1564 
1565   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1566   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1567 
1568   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
1571 }
1572 
1573 bool AMDGPULegalizerInfo::legalizeITOFP(
1574   MachineInstr &MI, MachineRegisterInfo &MRI,
1575   MachineIRBuilder &B, bool Signed) const {
1576   B.setInstr(MI);
1577 
1578   Register Dst = MI.getOperand(0).getReg();
1579   Register Src = MI.getOperand(1).getReg();
1580 
1581   const LLT S64 = LLT::scalar(64);
1582   const LLT S32 = LLT::scalar(32);
1583 
1584   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
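  // Split the 64-bit integer into 32-bit halves; each half converts to f64
  // exactly, so the result is (illustratively) ldexp((double)hi, 32) +
  // (double)lo, with the high half converted signed or unsigned as requested.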
1585 
1586   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1587 
1588   auto CvtHi = Signed ?
1589     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1590     B.buildUITOFP(S64, Unmerge.getReg(1));
1591 
1592   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1593 
1594   auto ThirtyTwo = B.buildConstant(S32, 32);
1595   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1596     .addUse(CvtHi.getReg(0))
1597     .addUse(ThirtyTwo.getReg(0));
1598 
1599   // TODO: Should this propagate fast-math-flags?
1600   B.buildFAdd(Dst, LdExp, CvtLo);
1601   MI.eraseFromParent();
1602   return true;
1603 }
1604 
1605 // TODO: Copied from DAG implementation. Verify logic and document how this
1606 // actually works.
1607 bool AMDGPULegalizerInfo::legalizeFPTOI(
1608   MachineInstr &MI, MachineRegisterInfo &MRI,
1609   MachineIRBuilder &B, bool Signed) const {
1610   B.setInstr(MI);
1611 
1612   Register Dst = MI.getOperand(0).getReg();
1613   Register Src = MI.getOperand(1).getReg();
1614 
1615   const LLT S64 = LLT::scalar(64);
1616   const LLT S32 = LLT::scalar(32);
1617 
1618   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1619 
1620   unsigned Flags = MI.getFlags();
1621 
1622   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1623   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1624   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
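  // Reading the bit patterns (illustrative): K0 = 2^-32 and K1 = -2^32, so
  // floor(trunc(src) * 2^-32) recovers the upper 32 bits and
  // fma(floor(...), -2^32, trunc(src)) leaves the lower 32 bits for the
  // second conversion.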
1625 
1626   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1627   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1628   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1629 
1630   auto Hi = Signed ?
1631     B.buildFPTOSI(S32, FloorMul) :
1632     B.buildFPTOUI(S32, FloorMul);
1633   auto Lo = B.buildFPTOUI(S32, Fma);
1634 
1635   B.buildMerge(Dst, { Lo, Hi });
1636   MI.eraseFromParent();
1637 
1638   return true;
1639 }
1640 
1641 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1642   MachineInstr &MI, MachineRegisterInfo &MRI,
1643   MachineIRBuilder &B) const {
1644   MachineFunction &MF = B.getMF();
1645   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1646 
1647   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1648                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1649 
1650   // With ieee_mode disabled, the instructions have the correct behavior
1651   // already for G_FMINNUM/G_FMAXNUM
1652   if (!MFI->getMode().IEEE)
1653     return !IsIEEEOp;
1654 
1655   if (IsIEEEOp)
1656     return true;
1657 
1658   MachineIRBuilder HelperBuilder(MI);
1659   GISelObserverWrapper DummyObserver;
1660   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1661   HelperBuilder.setInstr(MI);
1662   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1663 }
1664 
1665 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1666   MachineInstr &MI, MachineRegisterInfo &MRI,
1667   MachineIRBuilder &B) const {
1668   // TODO: Should move some of this into LegalizerHelper.
1669 
1670   // TODO: Promote dynamic indexing of s16 to s32
1671 
1672   // FIXME: Artifact combiner probably should have replaced the truncated
1673   // constant before this, so we shouldn't need
1674   // getConstantVRegValWithLookThrough.
1675   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1676     MI.getOperand(2).getReg(), MRI);
1677   if (!IdxVal) // Dynamic case will be selected to register indexing.
1678     return true;
1679 
1680   Register Dst = MI.getOperand(0).getReg();
1681   Register Vec = MI.getOperand(1).getReg();
1682 
1683   LLT VecTy = MRI.getType(Vec);
1684   LLT EltTy = VecTy.getElementType();
1685   assert(EltTy == MRI.getType(Dst));
1686 
1687   B.setInstr(MI);
1688 
1689   if (IdxVal->Value < VecTy.getNumElements())
1690     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1691   else
1692     B.buildUndef(Dst);
1693 
1694   MI.eraseFromParent();
1695   return true;
1696 }
1697 
1698 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1699   MachineInstr &MI, MachineRegisterInfo &MRI,
1700   MachineIRBuilder &B) const {
1701   // TODO: Should move some of this into LegalizerHelper.
1702 
1703   // TODO: Promote dynamic indexing of s16 to s32
1704 
1705   // FIXME: Artifact combiner probably should have replaced the truncated
1706   // constant before this, so we shouldn't need
1707   // getConstantVRegValWithLookThrough.
1708   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1709     MI.getOperand(3).getReg(), MRI);
1710   if (!IdxVal) // Dynamic case will be selected to register indexing.
1711     return true;
1712 
1713   Register Dst = MI.getOperand(0).getReg();
1714   Register Vec = MI.getOperand(1).getReg();
1715   Register Ins = MI.getOperand(2).getReg();
1716 
1717   LLT VecTy = MRI.getType(Vec);
1718   LLT EltTy = VecTy.getElementType();
1719   assert(EltTy == MRI.getType(Ins));
1720 
1721   B.setInstr(MI);
1722 
1723   if (IdxVal->Value < VecTy.getNumElements())
1724     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1725   else
1726     B.buildUndef(Dst);
1727 
1728   MI.eraseFromParent();
1729   return true;
1730 }
1731 
1732 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1733   assert(Mask.size() == 2);
1734 
1735   // If one half is undef, the other is trivially in the same reg.
1736   if (Mask[0] == -1 || Mask[1] == -1)
1737     return true;
1738   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1739          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1740 }
1741 
1742 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1743   MachineInstr &MI, MachineRegisterInfo &MRI,
1744   MachineIRBuilder &B) const {
1745   const LLT V2S16 = LLT::vector(2, 16);
1746 
1747   Register Dst = MI.getOperand(0).getReg();
1748   Register Src0 = MI.getOperand(1).getReg();
1749   LLT DstTy = MRI.getType(Dst);
1750   LLT SrcTy = MRI.getType(Src0);
1751 
1752   if (SrcTy == V2S16 && DstTy == V2S16 &&
1753       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1754     return true;
1755 
1756   MachineIRBuilder HelperBuilder(MI);
1757   GISelObserverWrapper DummyObserver;
1758   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1759   HelperBuilder.setInstr(MI);
1760   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1761 }
1762 
1763 bool AMDGPULegalizerInfo::legalizeSinCos(
1764   MachineInstr &MI, MachineRegisterInfo &MRI,
1765   MachineIRBuilder &B) const {
1766   B.setInstr(MI);
1767 
1768   Register DstReg = MI.getOperand(0).getReg();
1769   Register SrcReg = MI.getOperand(1).getReg();
1770   LLT Ty = MRI.getType(DstReg);
1771   unsigned Flags = MI.getFlags();
1772 
1773   Register TrigVal;
1774   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1775   if (ST.hasTrigReducedRange()) {
1776     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1777     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1778       .addUse(MulVal.getReg(0))
1779       .setMIFlags(Flags).getReg(0);
1780   } else
1781     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1782 
1783   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1784     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1785   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1786     .addUse(TrigVal)
1787     .setMIFlags(Flags);
1788   MI.eraseFromParent();
1789   return true;
1790 }
1791 
1792 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1793   Register DstReg, LLT PtrTy,
1794   MachineIRBuilder &B, const GlobalValue *GV,
1795   unsigned Offset, unsigned GAFlags) const {
1796   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1797   // to the following code sequence:
1798   //
1799   // For constant address space:
1800   //   s_getpc_b64 s[0:1]
1801   //   s_add_u32 s0, s0, $symbol
1802   //   s_addc_u32 s1, s1, 0
1803   //
1804   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1805   //   a fixup or relocation is emitted to replace $symbol with a literal
1806   //   constant, which is a pc-relative offset from the encoding of the $symbol
1807   //   operand to the global variable.
1808   //
1809   // For global address space:
1810   //   s_getpc_b64 s[0:1]
1811   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1812   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1813   //
1814   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1815   //   fixups or relocations are emitted to replace $symbol@*@lo and
1816   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1817   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1818   //   operand to the global variable.
1819   //
1820   // What we want here is an offset from the value returned by s_getpc
1821   // (which is the address of the s_add_u32 instruction) to the global
1822   // variable, but since the encoding of $symbol starts 4 bytes after the start
1823   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1824   // small. This requires us to add 4 to the global variable offset in order to
1825   // compute the correct address.
1826 
1827   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1828 
1829   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1830     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1831 
1832   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1833     .addDef(PCReg);
1834 
1835   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1836   if (GAFlags == SIInstrInfo::MO_NONE)
1837     MIB.addImm(0);
1838   else
1839     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1840 
1841   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1842 
1843   if (PtrTy.getSizeInBits() == 32)
1844     B.buildExtract(DstReg, PCReg, 0);
1845   return true;
}
1847 
1848 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1849   MachineInstr &MI, MachineRegisterInfo &MRI,
1850   MachineIRBuilder &B) const {
1851   Register DstReg = MI.getOperand(0).getReg();
1852   LLT Ty = MRI.getType(DstReg);
1853   unsigned AS = Ty.getAddressSpace();
1854 
1855   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1856   MachineFunction &MF = B.getMF();
1857   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1858   B.setInstr(MI);
1859 
1860   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1861     if (!MFI->isEntryFunction()) {
1862       const Function &Fn = MF.getFunction();
1863       DiagnosticInfoUnsupported BadLDSDecl(
1864         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1865       Fn.getContext().diagnose(BadLDSDecl);
1866     }
1867 
1868     // TODO: We could emit code to handle the initialization somewhere.
1869     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1870       const SITargetLowering *TLI = ST.getTargetLowering();
1871       if (!TLI->shouldUseLDSConstAddress(GV)) {
1872         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place.
1874       }
1875 
1876       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1877       MI.eraseFromParent();
1878       return true;
1879     }
1880 
1881     const Function &Fn = MF.getFunction();
1882     DiagnosticInfoUnsupported BadInit(
1883       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1884     Fn.getContext().diagnose(BadInit);
1885     return true;
1886   }
1887 
1888   const SITargetLowering *TLI = ST.getTargetLowering();
1889 
1890   if (TLI->shouldEmitFixup(GV)) {
1891     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1892     MI.eraseFromParent();
1893     return true;
1894   }
1895 
1896   if (TLI->shouldEmitPCReloc(GV)) {
1897     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1898     MI.eraseFromParent();
1899     return true;
1900   }
1901 
1902   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1903   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1904 
1905   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1906     MachinePointerInfo::getGOT(MF),
1907     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1908     MachineMemOperand::MOInvariant,
1909     8 /*Size*/, 8 /*Align*/);
1910 
1911   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1912 
1913   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1915     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1916     B.buildExtract(DstReg, Load, 0);
1917   } else
1918     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1919 
1920   MI.eraseFromParent();
1921   return true;
1922 }
1923 
1924 bool AMDGPULegalizerInfo::legalizeLoad(
1925   MachineInstr &MI, MachineRegisterInfo &MRI,
1926   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1927   B.setInstr(MI);
1928   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1929   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1930   Observer.changingInstr(MI);
1931   MI.getOperand(1).setReg(Cast.getReg(0));
1932   Observer.changedInstr(MI);
1933   return true;
1934 }
1935 
1936 bool AMDGPULegalizerInfo::legalizeFMad(
1937   MachineInstr &MI, MachineRegisterInfo &MRI,
1938   MachineIRBuilder &B) const {
1939   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1940   assert(Ty.isScalar());
1941 
1942   MachineFunction &MF = B.getMF();
1943   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1944 
1945   // TODO: Always legal with future ftz flag.
1946   // FIXME: Do we need just output?
1947   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
1948     return true;
1949   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
1950     return true;
1951 
1952   MachineIRBuilder HelperBuilder(MI);
1953   GISelObserverWrapper DummyObserver;
1954   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1955   HelperBuilder.setMBB(*MI.getParent());
1956   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1957 }
1958 
1959 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1960   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1961   Register DstReg = MI.getOperand(0).getReg();
1962   Register PtrReg = MI.getOperand(1).getReg();
1963   Register CmpVal = MI.getOperand(2).getReg();
1964   Register NewVal = MI.getOperand(3).getReg();
1965 
1966   assert(SITargetLowering::isFlatGlobalAddrSpace(
1967            MRI.getType(PtrReg).getAddressSpace()) &&
1968          "this should not have been custom lowered");
1969 
1970   LLT ValTy = MRI.getType(CmpVal);
1971   LLT VecTy = LLT::vector(2, ValTy);
1972 
1973   B.setInstr(MI);
1974   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1975 
1976   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1977     .addDef(DstReg)
1978     .addUse(PtrReg)
1979     .addUse(PackedVal)
1980     .setMemRefs(MI.memoperands());
1981 
1982   MI.eraseFromParent();
1983   return true;
1984 }
1985 
1986 bool AMDGPULegalizerInfo::legalizeFlog(
1987   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
1988   Register Dst = MI.getOperand(0).getReg();
1989   Register Src = MI.getOperand(1).getReg();
1990   LLT Ty = B.getMRI()->getType(Dst);
1991   unsigned Flags = MI.getFlags();
1992   B.setInstr(MI);
1993 
1994   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
1995   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
1996 
1997   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
1998   MI.eraseFromParent();
1999   return true;
2000 }
2001 
2002 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2003                                        MachineIRBuilder &B) const {
2004   Register Dst = MI.getOperand(0).getReg();
2005   Register Src = MI.getOperand(1).getReg();
2006   unsigned Flags = MI.getFlags();
2007   LLT Ty = B.getMRI()->getType(Dst);
2008   B.setInstr(MI);
2009 
2010   auto K = B.buildFConstant(Ty, numbers::log2e);
2011   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2012   B.buildFExp2(Dst, Mul, Flags);
2013   MI.eraseFromParent();
2014   return true;
2015 }
2016 
2017 // Find a source register, ignoring any possible source modifiers.
2018 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2019   Register ModSrc = OrigSrc;
2020   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2021     ModSrc = SrcFNeg->getOperand(1).getReg();
2022     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2023       ModSrc = SrcFAbs->getOperand(1).getReg();
2024   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2025     ModSrc = SrcFAbs->getOperand(1).getReg();
2026   return ModSrc;
2027 }
2028 
2029 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2030                                          MachineRegisterInfo &MRI,
2031                                          MachineIRBuilder &B) const {
2032   B.setInstr(MI);
2033 
2034   const LLT S1 = LLT::scalar(1);
2035   const LLT S64 = LLT::scalar(64);
2036   Register Dst = MI.getOperand(0).getReg();
2037   Register OrigSrc = MI.getOperand(1).getReg();
2038   unsigned Flags = MI.getFlags();
2039   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2040          "this should not have been custom lowered");
2041 
2042   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2043   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2044   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2045   // V_FRACT bug is:
2046   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2047   //
2048   // Convert floor(x) to (x - fract(x))
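  //
  // Illustrative example: floor(-1.25) -> fract(-1.25) = 0.75 and
  // -1.25 - 0.75 = -2.0.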
2049 
2050   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2051     .addUse(OrigSrc)
2052     .setMIFlags(Flags);
2053 
2054   // Give source modifier matching some assistance before obscuring a foldable
2055   // pattern.
2056 
2057   // TODO: We can avoid the neg on the fract? The input sign to fract
2058   // shouldn't matter?
2059   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2060 
2061   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2062 
2063   Register Min = MRI.createGenericVirtualRegister(S64);
2064 
2065   // We don't need to concern ourselves with the snan handling difference, so
2066   // use the one which will directly select.
2067   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2068   if (MFI->getMode().IEEE)
2069     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2070   else
2071     B.buildFMinNum(Min, Fract, Const, Flags);
2072 
2073   Register CorrectedFract = Min;
2074   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2076     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2077   }
2078 
2079   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2080   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2081 
2082   MI.eraseFromParent();
2083   return true;
2084 }
2085 
2086 // Turn an illegal packed v2s16 build vector into bit operations.
2087 // TODO: This should probably be a bitcast action in LegalizerHelper.
2088 bool AMDGPULegalizerInfo::legalizeBuildVector(
2089   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2090   Register Dst = MI.getOperand(0).getReg();
2091   LLT DstTy = MRI.getType(Dst);
2092   const LLT S32 = LLT::scalar(32);
2093   const LLT V2S16 = LLT::vector(2, 16);
2094   (void)DstTy;
2095   (void)V2S16;
2096   assert(DstTy == V2S16);
2097 
2098   Register Src0 = MI.getOperand(1).getReg();
2099   Register Src1 = MI.getOperand(2).getReg();
2100   assert(MRI.getType(Src0) == LLT::scalar(16));
2101 
2102   B.setInstr(MI);
2103   auto Merge = B.buildMerge(S32, {Src0, Src1});
2104   B.buildBitcast(Dst, Merge);
2105 
2106   MI.eraseFromParent();
2107   return true;
2108 }
2109 
2110 // Return the use branch instruction, otherwise null if the usage is invalid.
2111 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2112                                        MachineRegisterInfo &MRI,
2113                                        MachineInstr *&Br) {
2114   Register CondDef = MI.getOperand(0).getReg();
2115   if (!MRI.hasOneNonDBGUse(CondDef))
2116     return nullptr;
2117 
2118   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2119   if (UseMI.getParent() != MI.getParent() ||
2120       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2121     return nullptr;
2122 
2123   // Make sure the cond br is followed by a G_BR
2124   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2125   if (Next != MI.getParent()->end()) {
2126     if (Next->getOpcode() != AMDGPU::G_BR)
2127       return nullptr;
2128     Br = &*Next;
2129   }
2130 
2131   return &UseMI;
2132 }
2133 
2134 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2135                                                 Register Reg, LLT Ty) const {
2136   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2137   if (LiveIn)
2138     return LiveIn;
2139 
2140   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2141   MRI.addLiveIn(Reg, NewReg);
2142   return NewReg;
2143 }
2144 
2145 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2146                                          const ArgDescriptor *Arg) const {
2147   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2148     return false; // TODO: Handle these
2149 
2150   assert(Arg->getRegister().isPhysical());
2151 
2152   MachineRegisterInfo &MRI = *B.getMRI();
2153 
2154   LLT Ty = MRI.getType(DstReg);
2155   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2156 
2157   if (Arg->isMasked()) {
2158     // TODO: Should we try to emit this once in the entry block?
2159     const LLT S32 = LLT::scalar(32);
2160     const unsigned Mask = Arg->getMask();
2161     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
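    // Example (illustrative): workitem IDs are packed 10 bits apart, so an
    // argument described by mask 0xffc00 is recovered as (reg >> 10) & 0x3ff.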
2162 
2163     Register AndMaskSrc = LiveIn;
2164 
2165     if (Shift != 0) {
2166       auto ShiftAmt = B.buildConstant(S32, Shift);
2167       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2168     }
2169 
2170     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2171   } else
2172     B.buildCopy(DstReg, LiveIn);
2173 
  // Insert the argument copy if it doesn't already exist.
2175   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2176   if (!MRI.getVRegDef(LiveIn)) {
2177     // FIXME: Should have scoped insert pt
2178     MachineBasicBlock &OrigInsBB = B.getMBB();
2179     auto OrigInsPt = B.getInsertPt();
2180 
2181     MachineBasicBlock &EntryMBB = B.getMF().front();
2182     EntryMBB.addLiveIn(Arg->getRegister());
2183     B.setInsertPt(EntryMBB, EntryMBB.begin());
2184     B.buildCopy(LiveIn, Arg->getRegister());
2185 
2186     B.setInsertPt(OrigInsBB, OrigInsPt);
2187   }
2188 
2189   return true;
2190 }
2191 
2192 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2193   MachineInstr &MI,
2194   MachineRegisterInfo &MRI,
2195   MachineIRBuilder &B,
2196   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2197   B.setInstr(MI);
2198 
2199   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2200 
2201   const ArgDescriptor *Arg;
2202   const TargetRegisterClass *RC;
2203   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2204   if (!Arg) {
2205     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2206     return false;
2207   }
2208 
2209   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2210     MI.eraseFromParent();
2211     return true;
2212   }
2213 
2214   return false;
2215 }
2216 
2217 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2218                                        MachineRegisterInfo &MRI,
2219                                        MachineIRBuilder &B) const {
2220   B.setInstr(MI);
2221   Register Dst = MI.getOperand(0).getReg();
2222   LLT DstTy = MRI.getType(Dst);
2223   LLT S16 = LLT::scalar(16);
2224   LLT S32 = LLT::scalar(32);
2225   LLT S64 = LLT::scalar(64);
2226 
2227   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2228     return true;
2229 
2230   if (DstTy == S16)
2231     return legalizeFDIV16(MI, MRI, B);
2232   if (DstTy == S32)
2233     return legalizeFDIV32(MI, MRI, B);
2234   if (DstTy == S64)
2235     return legalizeFDIV64(MI, MRI, B);
2236 
2237   return false;
2238 }
2239 
2240 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2241                                                  MachineRegisterInfo &MRI,
2242                                                  MachineIRBuilder &B) const {
2243   Register Res = MI.getOperand(0).getReg();
2244   Register LHS = MI.getOperand(1).getReg();
2245   Register RHS = MI.getOperand(2).getReg();
2246 
2247   uint16_t Flags = MI.getFlags();
2248 
2249   LLT ResTy = MRI.getType(Res);
2250   LLT S32 = LLT::scalar(32);
2251   LLT S64 = LLT::scalar(64);
2252 
2253   const MachineFunction &MF = B.getMF();
2254   bool Unsafe =
2255     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2256 
2257   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2258     return false;
2259 
2260   if (!Unsafe && ResTy == S32 &&
2261       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2262     return false;
2263 
2264   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2265     // 1 / x -> RCP(x)
2266     if (CLHS->isExactlyValue(1.0)) {
2267       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2268         .addUse(RHS)
2269         .setMIFlags(Flags);
2270 
2271       MI.eraseFromParent();
2272       return true;
2273     }
2274 
2275     // -1 / x -> RCP( FNEG(x) )
2276     if (CLHS->isExactlyValue(-1.0)) {
2277       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2278       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2279         .addUse(FNeg.getReg(0))
2280         .setMIFlags(Flags);
2281 
2282       MI.eraseFromParent();
2283       return true;
2284     }
2285   }
2286 
2287   // x / y -> x * (1.0 / y)
2288   if (Unsafe) {
2289     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2290       .addUse(RHS)
2291       .setMIFlags(Flags);
2292     B.buildFMul(Res, LHS, RCP, Flags);
2293 
2294     MI.eraseFromParent();
2295     return true;
2296   }
2297 
2298   return false;
2299 }
2300 
2301 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2302                                          MachineRegisterInfo &MRI,
2303                                          MachineIRBuilder &B) const {
2304   B.setInstr(MI);
2305   Register Res = MI.getOperand(0).getReg();
2306   Register LHS = MI.getOperand(1).getReg();
2307   Register RHS = MI.getOperand(2).getReg();
2308 
2309   uint16_t Flags = MI.getFlags();
2310 
2311   LLT S16 = LLT::scalar(16);
2312   LLT S32 = LLT::scalar(32);
2313 
2314   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2315   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2316 
2317   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2318     .addUse(RHSExt.getReg(0))
2319     .setMIFlags(Flags);
2320 
2321   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2322   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2323 
2324   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2325     .addUse(RDst.getReg(0))
2326     .addUse(RHS)
2327     .addUse(LHS)
2328     .setMIFlags(Flags);
2329 
2330   MI.eraseFromParent();
2331   return true;
2332 }
2333 
// Enable or disable FP32 denorm mode. When 'Enable' is true, switch the FP32
// denorm mode to FP_DENORM_FLUSH_NONE (denormals preserved); when 'Enable' is
// false, restore the function's default FP32 denorm mode.
2336 static void toggleSPDenormMode(bool Enable,
2337                                MachineIRBuilder &B,
2338                                const GCNSubtarget &ST,
2339                                AMDGPU::SIModeRegisterDefaults Mode) {
2340   // Set SP denorm mode to this value.
2341   unsigned SPDenormMode =
2342     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2343 
2344   if (ST.hasDenormModeInst()) {
2345     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2346     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2347 
2348     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2349     B.buildInstr(AMDGPU::S_DENORM_MODE)
2350       .addImm(NewDenormModeValue);
2351 
2352   } else {
2353     // Select FP32 bit field in mode register.
2354     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2355                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2356                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
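    // This encodes hwreg(HW_REG_MODE, 4, 2): the FP32 denorm control is 2 bits
    // wide at bit offset 4 of the MODE register (WIDTH_M1 stores width - 1),
    // so s_setreg only rewrites those two bits.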
2357 
2358     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2359       .addImm(SPDenormMode)
2360       .addImm(SPDenormModeBitField);
2361   }
2362 }
2363 
2364 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2365                                          MachineRegisterInfo &MRI,
2366                                          MachineIRBuilder &B) const {
2367   B.setInstr(MI);
2368   Register Res = MI.getOperand(0).getReg();
2369   Register LHS = MI.getOperand(1).getReg();
2370   Register RHS = MI.getOperand(2).getReg();
2371   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2372   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2373 
2374   uint16_t Flags = MI.getFlags();
2375 
2376   LLT S32 = LLT::scalar(32);
2377   LLT S1 = LLT::scalar(1);
2378 
2379   auto One = B.buildFConstant(S32, 1.0f);
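  // Rough summary of the sequence below: scale the numerator and denominator
  // with div_scale, start from the hardware reciprocal estimate, refine the
  // reciprocal and the quotient with FMA-based Newton-Raphson steps, then let
  // div_fmas/div_fixup apply the final scaling and special-case handling.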
2380 
2381   auto DenominatorScaled =
2382     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2383       .addUse(RHS)
2384       .addUse(LHS)
2385       .addImm(1)
2386       .setMIFlags(Flags);
2387   auto NumeratorScaled =
2388     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2389       .addUse(LHS)
2390       .addUse(RHS)
2391       .addImm(0)
2392       .setMIFlags(Flags);
2393 
2394   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2395     .addUse(DenominatorScaled.getReg(0))
2396     .setMIFlags(Flags);
2397   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2398 
2399   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2400   // aren't modeled as reading it.
2401   if (!Mode.allFP32Denormals())
2402     toggleSPDenormMode(true, B, ST, Mode);
2403 
2404   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2405   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2406   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2407   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2408   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2409   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2410 
2411   if (!Mode.allFP32Denormals())
2412     toggleSPDenormMode(false, B, ST, Mode);
2413 
2414   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2415     .addUse(Fma4.getReg(0))
2416     .addUse(Fma1.getReg(0))
2417     .addUse(Fma3.getReg(0))
2418     .addUse(NumeratorScaled.getReg(1))
2419     .setMIFlags(Flags);
2420 
2421   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2422     .addUse(Fmas.getReg(0))
2423     .addUse(RHS)
2424     .addUse(LHS)
2425     .setMIFlags(Flags);
2426 
2427   MI.eraseFromParent();
2428   return true;
2429 }
2430 
2431 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2432                                          MachineRegisterInfo &MRI,
2433                                          MachineIRBuilder &B) const {
2434   B.setInstr(MI);
2435   Register Res = MI.getOperand(0).getReg();
2436   Register LHS = MI.getOperand(1).getReg();
2437   Register RHS = MI.getOperand(2).getReg();
2438 
2439   uint16_t Flags = MI.getFlags();
2440 
2441   LLT S64 = LLT::scalar(64);
2442   LLT S1 = LLT::scalar(1);
2443 
2444   auto One = B.buildFConstant(S64, 1.0);
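  // Same overall scheme as the f32 path (div_scale, reciprocal estimate, FMA
  // refinement, div_fmas/div_fixup); the only structural difference is the
  // workaround further down for the SI div_scale condition-output bug.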
2445 
2446   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2447     .addUse(LHS)
2448     .addUse(RHS)
2449     .addImm(1)
2450     .setMIFlags(Flags);
2451 
2452   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2453 
2454   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2455     .addUse(DivScale0.getReg(0))
2456     .setMIFlags(Flags);
2457 
2458   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2459   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2460   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2461 
2462   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2463     .addUse(LHS)
2464     .addUse(RHS)
2465     .addImm(0)
2466     .setMIFlags(Flags);
2467 
2468   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2470   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2471 
2472   Register Scale;
2473   if (!ST.hasUsableDivScaleConditionOutput()) {
2474     // Workaround a hardware bug on SI where the condition output from div_scale
2475     // is not usable.
2476 
2477     LLT S32 = LLT::scalar(32);
2478 
2479     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2480     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2481     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2482     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2483 
2484     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2485                               Scale1Unmerge.getReg(1));
2486     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2487                               Scale0Unmerge.getReg(1));
2488     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2489   } else {
2490     Scale = DivScale1.getReg(1);
2491   }
2492 
2493   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2494     .addUse(Fma4.getReg(0))
2495     .addUse(Fma3.getReg(0))
2496     .addUse(Mul.getReg(0))
2497     .addUse(Scale)
2498     .setMIFlags(Flags);
2499 
2500   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2501     .addUse(Fmas.getReg(0))
2502     .addUse(RHS)
2503     .addUse(LHS)
2504     .setMIFlags(Flags);
2505 
2506   MI.eraseFromParent();
2507   return true;
2508 }
2509 
2510 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2511                                                  MachineRegisterInfo &MRI,
2512                                                  MachineIRBuilder &B) const {
2513   B.setInstr(MI);
2514   Register Res = MI.getOperand(0).getReg();
2515   Register LHS = MI.getOperand(2).getReg();
2516   Register RHS = MI.getOperand(3).getReg();
2517   uint16_t Flags = MI.getFlags();
2518 
2519   LLT S32 = LLT::scalar(32);
2520   LLT S1 = LLT::scalar(1);
2521 
2522   auto Abs = B.buildFAbs(S32, RHS, Flags);
2523   const APFloat C0Val(1.0f);
2524 
2525   auto C0 = B.buildConstant(S32, 0x6f800000);
2526   auto C1 = B.buildConstant(S32, 0x2f800000);
2527   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
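  // Reading the constants (illustrative): 0x6f800000 is 2^96 and 0x2f800000 is
  // 2^-32. If |rhs| is very large, pre-scale it by 2^-32 so the reciprocal of
  // the scaled value does not underflow, then multiply the quotient by the
  // same scale factor afterwards.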
2528 
2529   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2530   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2531 
2532   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2533 
2534   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2535     .addUse(Mul0.getReg(0))
2536     .setMIFlags(Flags);
2537 
2538   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2539 
2540   B.buildFMul(Res, Sel, Mul1, Flags);
2541 
2542   MI.eraseFromParent();
2543   return true;
2544 }
2545 
2546 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2547                                                  MachineRegisterInfo &MRI,
2548                                                  MachineIRBuilder &B) const {
2549   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2550   if (!MFI->isEntryFunction()) {
2551     return legalizePreloadedArgIntrin(MI, MRI, B,
2552                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2553   }
2554 
2555   B.setInstr(MI);
2556 
2557   uint64_t Offset =
2558     ST.getTargetLowering()->getImplicitParameterOffset(
2559       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2560   Register DstReg = MI.getOperand(0).getReg();
2561   LLT DstTy = MRI.getType(DstReg);
2562   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2563 
2564   const ArgDescriptor *Arg;
2565   const TargetRegisterClass *RC;
2566   std::tie(Arg, RC)
2567     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2568   if (!Arg)
2569     return false;
2570 
2571   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2572   if (!loadInputValue(KernargPtrReg, B, Arg))
2573     return false;
2574 
2575   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2576   MI.eraseFromParent();
2577   return true;
2578 }
2579 
2580 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2581                                               MachineRegisterInfo &MRI,
2582                                               MachineIRBuilder &B,
2583                                               unsigned AddrSpace) const {
2584   B.setInstr(MI);
2585   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2586   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2587   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2588   MI.eraseFromParent();
2589   return true;
2590 }
2591 
2592 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2593 // offset (the offset that is included in bounds checking and swizzling, to be
2594 // split between the instruction's voffset and immoffset fields) and soffset
2595 // (the offset that is excluded from bounds checking and swizzling, to go in
2596 // the instruction's soffset field).  This function takes the first kind of
2597 // offset and figures out how to split it between voffset and immoffset.
2598 std::tuple<Register, unsigned, unsigned>
2599 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2600                                         Register OrigOffset) const {
2601   const unsigned MaxImm = 4095;
2602   Register BaseReg;
2603   unsigned TotalConstOffset;
2604   MachineInstr *OffsetDef;
2605   const LLT S32 = LLT::scalar(32);
2606 
2607   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2608     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2609 
2610   unsigned ImmOffset = TotalConstOffset;
2611 
2612   // If the immediate value is too big for the immoffset field, put the value
2613   // and -4096 into the immoffset field so that the value that is copied/added
2614   // for the voffset field is a multiple of 4096, and it stands more chance
2615   // of being CSEd with the copy/add for another similar load/store.
2616   // However, do not do that rounding down to a multiple of 4096 if that is a
2617   // negative number, as it appears to be illegal to have a negative offset
2618   // in the vgpr, even if adding the immediate offset makes it positive.
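  // Worked example (illustrative): a constant offset of 8212 splits into 8192
  // carried in the voffset register and 20 left in the immediate field.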
2619   unsigned Overflow = ImmOffset & ~MaxImm;
2620   ImmOffset -= Overflow;
2621   if ((int32_t)Overflow < 0) {
2622     Overflow += ImmOffset;
2623     ImmOffset = 0;
2624   }
2625 
2626   if (Overflow != 0) {
2627     if (!BaseReg) {
2628       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2629     } else {
2630       auto OverflowVal = B.buildConstant(S32, Overflow);
2631       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2632     }
2633   }
2634 
2635   if (!BaseReg)
2636     BaseReg = B.buildConstant(S32, 0).getReg(0);
2637 
2638   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2639 }
2640 
2641 /// Handle register layout difference for f16 images for some subtargets.
2642 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2643                                              MachineRegisterInfo &MRI,
2644                                              Register Reg) const {
2645   if (!ST.hasUnpackedD16VMem())
2646     return Reg;
2647 
2648   const LLT S16 = LLT::scalar(16);
2649   const LLT S32 = LLT::scalar(32);
2650   LLT StoreVT = MRI.getType(Reg);
2651   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
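  // With unpacked D16 memory instructions each 16-bit element must occupy the
  // low half of its own 32-bit register, so <N x s16> data is widened to
  // <N x s32> below (a sketch of the register layout, not a new data format).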
2652 
2653   auto Unmerge = B.buildUnmerge(S16, Reg);
2654 
2655   SmallVector<Register, 4> WideRegs;
2656   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2657     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2658 
2659   int NumElts = StoreVT.getNumElements();
2660 
2661   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2662 }
2663 
2664 Register AMDGPULegalizerInfo::fixStoreSourceType(
2665   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2666   MachineRegisterInfo *MRI = B.getMRI();
2667   LLT Ty = MRI->getType(VData);
2668 
2669   const LLT S16 = LLT::scalar(16);
2670 
2671   // Fixup illegal register types for i8 stores.
2672   if (Ty == LLT::scalar(8) || Ty == S16) {
2673     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2674     return AnyExt;
2675   }
2676 
2677   if (Ty.isVector()) {
2678     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2679       if (IsFormat)
2680         return handleD16VData(B, *MRI, VData);
2681     }
2682   }
2683 
2684   return VData;
2685 }
2686 
2687 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2688                                               MachineRegisterInfo &MRI,
2689                                               MachineIRBuilder &B,
2690                                               bool IsTyped,
2691                                               bool IsFormat) const {
2692   B.setInstr(MI);
2693 
2694   Register VData = MI.getOperand(1).getReg();
2695   LLT Ty = MRI.getType(VData);
2696   LLT EltTy = Ty.getScalarType();
2697   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2698   const LLT S32 = LLT::scalar(32);
2699 
2700   VData = fixStoreSourceType(B, VData, IsFormat);
2701   Register RSrc = MI.getOperand(2).getReg();
2702 
2703   MachineMemOperand *MMO = *MI.memoperands_begin();
2704   const int MemSize = MMO->getSize();
2705 
2706   unsigned ImmOffset;
2707   unsigned TotalOffset;
2708 
2709   // The typed intrinsics add an immediate after the registers.
2710   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
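  // Operand layout (descriptive): (intrinsic id, vdata, rsrc, [vindex],
  // voffset, soffset, [format], aux), so the counts above identify the struct
  // forms, which carry the extra vindex operand.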
2711 
2712   // The struct intrinsic variants add one additional operand over raw.
2713   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2714   Register VIndex;
2715   int OpOffset = 0;
2716   if (HasVIndex) {
2717     VIndex = MI.getOperand(3).getReg();
2718     OpOffset = 1;
2719   }
2720 
2721   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2722   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2723 
2724   unsigned Format = 0;
2725   if (IsTyped) {
2726     Format = MI.getOperand(5 + OpOffset).getImm();
2727     ++OpOffset;
2728   }
2729 
2730   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2731 
2732   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2733   if (TotalOffset != 0)
2734     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2735 
2736   unsigned Opc;
2737   if (IsTyped) {
2738     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2739                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2740   } else if (IsFormat) {
2741     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2742                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2743   } else {
2744     switch (MemSize) {
2745     case 1:
2746       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2747       break;
2748     case 2:
2749       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2750       break;
2751     default:
2752       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2753       break;
2754     }
2755   }
2756 
2757   if (!VIndex)
2758     VIndex = B.buildConstant(S32, 0).getReg(0);
2759 
2760   auto MIB = B.buildInstr(Opc)
2761     .addUse(VData)              // vdata
2762     .addUse(RSrc)               // rsrc
2763     .addUse(VIndex)             // vindex
2764     .addUse(VOffset)            // voffset
2765     .addUse(SOffset)            // soffset
2766     .addImm(ImmOffset);         // offset(imm)
2767 
2768   if (IsTyped)
2769     MIB.addImm(Format);
2770 
2771   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2772      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2773      .addMemOperand(MMO);
2774 
2775   MI.eraseFromParent();
2776   return true;
2777 }
2778 
2779 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2780                                              MachineRegisterInfo &MRI,
2781                                              MachineIRBuilder &B,
2782                                              bool IsFormat,
2783                                              bool IsTyped) const {
2784   B.setInstr(MI);
2785 
2786   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2787   MachineMemOperand *MMO = *MI.memoperands_begin();
2788   const int MemSize = MMO->getSize();
2789   const LLT S32 = LLT::scalar(32);
2790 
2791   Register Dst = MI.getOperand(0).getReg();
2792   Register RSrc = MI.getOperand(2).getReg();
2793 
2794   // The typed intrinsics add an immediate after the registers.
2795   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2796 
2797   // The struct intrinsic variants add one additional operand over raw.
2798   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2799   Register VIndex;
2800   int OpOffset = 0;
2801   if (HasVIndex) {
2802     VIndex = MI.getOperand(3).getReg();
2803     OpOffset = 1;
2804   }
2805 
2806   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2807   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2808 
2809   unsigned Format = 0;
2810   if (IsTyped) {
2811     Format = MI.getOperand(5 + OpOffset).getImm();
2812     ++OpOffset;
2813   }
2814 
2815   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2816   unsigned ImmOffset;
2817   unsigned TotalOffset;
2818 
2819   LLT Ty = MRI.getType(Dst);
2820   LLT EltTy = Ty.getScalarType();
2821   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2822   const bool Unpacked = ST.hasUnpackedD16VMem();
2823 
2824   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2825   if (TotalOffset != 0)
2826     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2827 
2828   unsigned Opc;
2829 
2830   if (IsTyped) {
2831     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2832                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2833   } else if (IsFormat) {
2834     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2835                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2836   } else {
2837     switch (MemSize) {
2838     case 1:
2839       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2840       break;
2841     case 2:
2842       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2843       break;
2844     default:
2845       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2846       break;
2847     }
2848   }
2849 
2850   Register LoadDstReg;
2851 
2852   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2853   LLT UnpackedTy = Ty.changeElementSize(32);
2854 
2855   if (IsExtLoad)
2856     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2857   else if (Unpacked && IsD16 && Ty.isVector())
2858     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2859   else
2860     LoadDstReg = Dst;
2861 
2862   if (!VIndex)
2863     VIndex = B.buildConstant(S32, 0).getReg(0);
2864 
2865   auto MIB = B.buildInstr(Opc)
2866     .addDef(LoadDstReg)         // vdata
2867     .addUse(RSrc)               // rsrc
2868     .addUse(VIndex)             // vindex
2869     .addUse(VOffset)            // voffset
2870     .addUse(SOffset)            // soffset
2871     .addImm(ImmOffset);         // offset(imm)
2872 
2873   if (IsTyped)
2874     MIB.addImm(Format);
2875 
2876   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2877      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2878      .addMemOperand(MMO);
2879 
2880   if (LoadDstReg != Dst) {
2881     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2882 
    // The result was widened for an extending load; truncate back to the
    // original type.
2884     if (IsExtLoad)
2885       B.buildTrunc(Dst, LoadDstReg);
2886     else {
2887       // Repack to original 16-bit vector result
2888       // FIXME: G_TRUNC should work, but legalization currently fails
2889       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2890       SmallVector<Register, 4> Repack;
2891       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2892         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2893       B.buildMerge(Dst, Repack);
2894     }
2895   }
2896 
2897   MI.eraseFromParent();
2898   return true;
2899 }
2900 
2901 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2902                                                MachineIRBuilder &B,
2903                                                bool IsInc) const {
2904   B.setInstr(MI);
2905   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2906                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2907   B.buildInstr(Opc)
2908     .addDef(MI.getOperand(0).getReg())
2909     .addUse(MI.getOperand(2).getReg())
2910     .addUse(MI.getOperand(3).getReg())
2911     .cloneMemRefs(MI);
2912   MI.eraseFromParent();
2913   return true;
2914 }
2915 
2916 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2917   switch (IntrID) {
2918   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2919   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2920     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2921   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2922   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2923     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2924   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2925   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2926     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2927   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2928   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2929     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2930   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2931   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2932     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2933   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2934   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2935     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2936   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2937   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2938     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2939   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2940   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2941     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2942   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2943   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2944     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2945   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2946   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2947     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
2948   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2949   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2950     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
2951   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2952   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2953     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
2954   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
2955   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
2956     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
2957   default:
2958     llvm_unreachable("unhandled atomic opcode");
2959   }
2960 }
2961 
2962 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
2963                                                MachineIRBuilder &B,
2964                                                Intrinsic::ID IID) const {
2965   B.setInstr(MI);
2966 
2967   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
2968                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
2969 
2970   Register Dst = MI.getOperand(0).getReg();
2971   Register VData = MI.getOperand(2).getReg();
2972 
2973   Register CmpVal;
2974   int OpOffset = 0;
2975 
2976   if (IsCmpSwap) {
2977     CmpVal = MI.getOperand(3 + OpOffset).getReg();
2978     ++OpOffset;
2979   }
2980 
2981   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2982   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2983 
2984   // The struct intrinsic variants add one additional operand over raw.
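  // A sketch of the assumed incoming operand layout (the cmpswap forms insert
  // the compare value after vdata, and the struct forms insert vindex before
  // voffset):
  //   (dst, intrinsic_id, vdata, [cmp,] rsrc, [vindex,] voffset, soffset, aux)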
2985   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2986   Register VIndex;
2987   if (HasVIndex) {
2988     VIndex = MI.getOperand(4 + OpOffset).getReg();
2989     ++OpOffset;
2990   }
2991 
2992   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2993   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2994   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2995 
2996   MachineMemOperand *MMO = *MI.memoperands_begin();
2997 
2998   unsigned ImmOffset;
2999   unsigned TotalOffset;
3000   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3001   if (TotalOffset != 0)
3002     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3003 
3004   if (!VIndex)
3005     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3006 
3007   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3008     .addDef(Dst)
3009     .addUse(VData); // vdata
3010 
3011   if (IsCmpSwap)
    MIB.addUse(CmpVal);          // cmp
3013 
3014   MIB.addUse(RSrc)               // rsrc
3015      .addUse(VIndex)             // vindex
3016      .addUse(VOffset)            // voffset
3017      .addUse(SOffset)            // soffset
3018      .addImm(ImmOffset)          // offset(imm)
3019      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3020      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3021      .addMemOperand(MMO);
3022 
3023   MI.eraseFromParent();
3024   return true;
3025 }
3026 
3027 // Produce a vector of s16 elements from s32 pieces.
3028 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3029                              ArrayRef<Register> UnmergeParts) {
3030   const LLT S16 = LLT::scalar(16);
3031 
3032   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3033   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3034     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3035 
3036   B.buildBuildVector(DstReg, RemergeParts);
3037 }
3038 
3039 /// Convert a set of s32 registers to a result vector with s16 elements.
3040 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3041                                ArrayRef<Register> UnmergeParts) {
3042   MachineRegisterInfo &MRI = *B.getMRI();
3043   const LLT V2S16 = LLT::vector(2, 16);
3044   LLT TargetTy = MRI.getType(DstReg);
3045   int NumElts = UnmergeParts.size();
3046 
3047   if (NumElts == 1) {
3048     assert(TargetTy == V2S16);
3049     B.buildBitcast(DstReg, UnmergeParts[0]);
3050     return;
3051   }
3052 
3053   SmallVector<Register, 4> RemergeParts(NumElts);
3054   for (int I = 0; I != NumElts; ++I)
3055     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3056 
3057   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3058     B.buildConcatVectors(DstReg, RemergeParts);
3059     return;
3060   }
3061 
3062   const LLT V3S16 = LLT::vector(3, 16);
3063   const LLT V6S16 = LLT::vector(6, 16);
3064 
3065   // Widen to v6s16 and unpack v3 parts.
3066   assert(TargetTy == V3S16);
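  // Sketch of the widening trick used below: pad the two v2s16 pieces with an
  // undef v2s16 to form a v6s16, unmerge that into two v3s16 halves, and keep
  // only the first half (the real data) in DstReg.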
3067 
3068   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3069   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3070   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3071 }
3072 
// FIXME: A plain vector trunc should be sufficient, but legalization is
// currently broken.
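// Roughly, for a <2 x s16> destination loaded as <2 x s32> on unpacked-d16
// subtargets (a sketch with hypothetical register names):
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %wide:_(<2 x s32>)
//   %lo16:_(s16) = G_TRUNC %lo
//   %hi16:_(s16) = G_TRUNC %hi
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR %lo16, %hi16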
3075 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3076                                   Register WideDstReg) {
3077   const LLT S32 = LLT::scalar(32);
3078   const LLT S16 = LLT::scalar(16);
3079 
3080   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3081 
3082   int NumOps = Unmerge->getNumOperands() - 1;
3083   SmallVector<Register, 4> RemergeParts(NumOps);
3084   for (int I = 0; I != NumOps; ++I)
3085     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3086 
3087   B.buildBuildVector(DstReg, RemergeParts);
3088 }
3089 
3090 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3091     MachineInstr &MI, MachineIRBuilder &B,
3092     GISelChangeObserver &Observer,
3093     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3094   bool IsTFE = MI.getNumExplicitDefs() == 2;
3095 
  // We only need to touch the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or when the TFE result needs to be
  // repacked.
3098 
3099   // TODO: Need to handle a16 images too
3100   // TODO: Do we need to guard against already legalized intrinsics?
3101   if (!IsTFE && !ST.hasUnpackedD16VMem())
3102     return true;
3103 
3104   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3105     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3106 
3107   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3108     return true;
3109 
3110   B.setInstr(MI);
3111 
3112   MachineRegisterInfo *MRI = B.getMRI();
3113   const LLT S32 = LLT::scalar(32);
3114   const LLT S16 = LLT::scalar(16);
3115 
3116   if (BaseOpcode->Store) { // No TFE for stores?
3117     Register VData = MI.getOperand(1).getReg();
3118     LLT Ty = MRI->getType(VData);
3119     if (!Ty.isVector() || Ty.getElementType() != S16)
3120       return true;
3121 
3124     Observer.changingInstr(MI);
3125     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3126     Observer.changedInstr(MI);
3127     return true;
3128   }
3129 
3130   Register DstReg = MI.getOperand(0).getReg();
3131   LLT Ty = MRI->getType(DstReg);
3132   const LLT EltTy = Ty.getScalarType();
3133   const bool IsD16 = Ty.getScalarType() == S16;
3134   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3135 
3136   if (IsTFE) {
    // In the IR, TFE is supposed to be used with a two-element struct return
    // type. The instruction really returns these two values in one contiguous
    // register, with one additional dword beyond the loaded data. Rewrite the
    // return type to use a single register result.
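    // A rough sketch for a non-d16 s32 load with TFE (hypothetical names):
    //   %data:_(s32), %tfe:_(s32) = <image load>
    // is rewritten below as
    //   %wide:_(<2 x s32>) = <image load>
    //   %data:_(s32), %tfe:_(s32) = G_UNMERGE_VALUES %wide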
3141     Register Dst1Reg = MI.getOperand(1).getReg();
3142     if (MRI->getType(Dst1Reg) != S32)
3143       return false;
3144 
3145     // TODO: Make sure the TFE operand bit is set.
3146 
    // The raw dword-aligned data component of the load. The only legal cases
    // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
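    // E.g. for a <3 x s16> d16 result: on unpacked subtargets RoundedTy is
    // <3 x s32> and TFETy is <4 x s32>; on packed subtargets RoundedTy is
    // <4 x s16> and TFETy is <3 x s32>.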
3150     LLT RoundedTy;
3151     LLT TFETy;
3152 
3153     if (IsD16 && ST.hasUnpackedD16VMem()) {
3154       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3155       TFETy = LLT::vector(NumElts + 1, 32);
3156     } else {
3157       unsigned EltSize = Ty.getScalarSizeInBits();
3158       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3159       unsigned RoundedSize = 32 * RoundedElts;
3160       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3161       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3162     }
3163 
3164     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3165     Observer.changingInstr(MI);
3166 
3167     MI.getOperand(0).setReg(TFEReg);
3168     MI.RemoveOperand(1);
3169 
3170     Observer.changedInstr(MI);
3171 
3172     // Insert after the instruction.
3173     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3174 
3175     // Now figure out how to copy the new result register back into the old
3176     // result.
3177 
3178     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3179     int NumDataElts = TFETy.getNumElements() - 1;
3180 
3181     if (!Ty.isVector()) {
3182       // Simplest case is a trivial unmerge (plus a truncate for d16).
3183       UnmergeResults[0] = Ty == S32 ?
3184         DstReg : MRI->createGenericVirtualRegister(S32);
3185 
3186       B.buildUnmerge(UnmergeResults, TFEReg);
3187       if (Ty != S32)
3188         B.buildTrunc(DstReg, UnmergeResults[0]);
3189       return true;
3190     }
3191 
3192     // We have to repack into a new vector of some kind.
3193     for (int I = 0; I != NumDataElts; ++I)
3194       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3195     B.buildUnmerge(UnmergeResults, TFEReg);
3196 
3197     // Drop the final TFE element.
3198     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3199 
3200     if (EltTy == S32)
3201       B.buildBuildVector(DstReg, DataPart);
3202     else if (ST.hasUnpackedD16VMem())
3203       truncToS16Vector(B, DstReg, DataPart);
3204     else
3205       bitcastToS16Vector(B, DstReg, DataPart);
3206 
3207     return true;
3208   }
3209 
3210   // Must be an image load.
3211   if (!Ty.isVector() || Ty.getElementType() != S16)
3212     return true;
3213 
3214   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3215 
3216   LLT WidenedTy = Ty.changeElementType(S32);
3217   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3218 
3219   Observer.changingInstr(MI);
3220   MI.getOperand(0).setReg(WideDstReg);
3221   Observer.changedInstr(MI);
3222 
3223   repackUnpackedD16Load(B, DstReg, WideDstReg);
3224   return true;
3225 }
3226 
3227 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3228   MachineInstr &MI, MachineIRBuilder &B,
3229   GISelChangeObserver &Observer) const {
3230   Register Dst = MI.getOperand(0).getReg();
3231   LLT Ty = B.getMRI()->getType(Dst);
3232   unsigned Size = Ty.getSizeInBits();
3233   MachineFunction &MF = B.getMF();
3234 
3235   Observer.changingInstr(MI);
3236 
3237   // FIXME: We don't really need this intermediate instruction. The intrinsic
3238   // should be fixed to have a memory operand. Since it's readnone, we're not
3239   // allowed to add one.
3240   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3241   MI.RemoveOperand(1); // Remove intrinsic ID
3242 
3243   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3244   // TODO: Should this use datalayout alignment?
3245   const unsigned MemSize = (Size + 7) / 8;
3246   const unsigned MemAlign = 4;
3247   MachineMemOperand *MMO = MF.getMachineMemOperand(
3248     MachinePointerInfo(),
3249     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3250     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3251   MI.addMemOperand(MF, MMO);
3252 
3253   // There are no 96-bit result scalar loads, but widening to 128-bit should
3254   // always be legal. We may need to restore this to a 96-bit result if it turns
3255   // out this needs to be converted to a vector load during RegBankSelect.
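  // E.g. an s96 result is rounded up to s128, and a <3 x s32> result to
  // <4 x s32>, by the power-of-two helpers used below.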
3256   if (!isPowerOf2_32(Size)) {
3257     LegalizerHelper Helper(MF, *this, Observer, B);
3258     B.setInstr(MI);
3259 
3260     if (Ty.isVector())
3261       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3262     else
3263       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3264   }
3265 
3266   Observer.changedInstr(MI);
3267   return true;
3268 }
3269 
3270 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3271                                             MachineIRBuilder &B,
3272                                             GISelChangeObserver &Observer) const {
3273   MachineRegisterInfo &MRI = *B.getMRI();
3274 
  // Replace the G_BRCOND use of the control-flow intrinsic with the
  // exec-manipulating branch pseudos.
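  // Roughly: the intrinsic's i1 result feeds a G_BRCOND (possibly followed by
  // an unconditional G_BR); the intrinsic and the G_BRCOND are replaced with a
  // single SI_IF / SI_ELSE / SI_LOOP pseudo that manipulates exec and branches.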
3276   auto IntrID = MI.getIntrinsicID();
3277   switch (IntrID) {
3278   case Intrinsic::amdgcn_if:
3279   case Intrinsic::amdgcn_else: {
3280     MachineInstr *Br = nullptr;
3281     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3282       const SIRegisterInfo *TRI
3283         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3284 
3285       B.setInstr(*BrCond);
3286       Register Def = MI.getOperand(1).getReg();
3287       Register Use = MI.getOperand(3).getReg();
3288 
3289       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3290       if (Br)
3291         BrTarget = Br->getOperand(0).getMBB();
3292 
3293       if (IntrID == Intrinsic::amdgcn_if) {
3294         B.buildInstr(AMDGPU::SI_IF)
3295           .addDef(Def)
3296           .addUse(Use)
3297           .addMBB(BrTarget);
3298       } else {
3299         B.buildInstr(AMDGPU::SI_ELSE)
3300           .addDef(Def)
3301           .addUse(Use)
3302           .addMBB(BrTarget)
3303           .addImm(0);
3304       }
3305 
3306       if (Br)
3307         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3308 
3309       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3310       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3311       MI.eraseFromParent();
3312       BrCond->eraseFromParent();
3313       return true;
3314     }
3315 
3316     return false;
3317   }
3318   case Intrinsic::amdgcn_loop: {
3319     MachineInstr *Br = nullptr;
3320     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3321       const SIRegisterInfo *TRI
3322         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3323 
3324       B.setInstr(*BrCond);
3325 
3326       // FIXME: Need to adjust branch targets based on unconditional branch.
3327       Register Reg = MI.getOperand(2).getReg();
3328       B.buildInstr(AMDGPU::SI_LOOP)
3329         .addUse(Reg)
3330         .addMBB(BrCond->getOperand(1).getMBB());
3331       MI.eraseFromParent();
3332       BrCond->eraseFromParent();
3333       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3334       return true;
3335     }
3336 
3337     return false;
3338   }
3339   case Intrinsic::amdgcn_kernarg_segment_ptr:
3340     return legalizePreloadedArgIntrin(
3341       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3342   case Intrinsic::amdgcn_implicitarg_ptr:
3343     return legalizeImplicitArgPtr(MI, MRI, B);
3344   case Intrinsic::amdgcn_workitem_id_x:
3345     return legalizePreloadedArgIntrin(MI, MRI, B,
3346                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3347   case Intrinsic::amdgcn_workitem_id_y:
3348     return legalizePreloadedArgIntrin(MI, MRI, B,
3349                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3350   case Intrinsic::amdgcn_workitem_id_z:
3351     return legalizePreloadedArgIntrin(MI, MRI, B,
3352                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3353   case Intrinsic::amdgcn_workgroup_id_x:
3354     return legalizePreloadedArgIntrin(MI, MRI, B,
3355                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3356   case Intrinsic::amdgcn_workgroup_id_y:
3357     return legalizePreloadedArgIntrin(MI, MRI, B,
3358                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3359   case Intrinsic::amdgcn_workgroup_id_z:
3360     return legalizePreloadedArgIntrin(MI, MRI, B,
3361                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3362   case Intrinsic::amdgcn_dispatch_ptr:
3363     return legalizePreloadedArgIntrin(MI, MRI, B,
3364                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3365   case Intrinsic::amdgcn_queue_ptr:
3366     return legalizePreloadedArgIntrin(MI, MRI, B,
3367                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3368   case Intrinsic::amdgcn_implicit_buffer_ptr:
3369     return legalizePreloadedArgIntrin(
3370       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3371   case Intrinsic::amdgcn_dispatch_id:
3372     return legalizePreloadedArgIntrin(MI, MRI, B,
3373                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3374   case Intrinsic::amdgcn_fdiv_fast:
3375     return legalizeFDIVFastIntrin(MI, MRI, B);
3376   case Intrinsic::amdgcn_is_shared:
3377     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3378   case Intrinsic::amdgcn_is_private:
3379     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3380   case Intrinsic::amdgcn_wavefrontsize: {
3381     B.setInstr(MI);
3382     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3383     MI.eraseFromParent();
3384     return true;
3385   }
3386   case Intrinsic::amdgcn_s_buffer_load:
3387     return legalizeSBufferLoad(MI, B, Observer);
3388   case Intrinsic::amdgcn_raw_buffer_store:
3389   case Intrinsic::amdgcn_struct_buffer_store:
3390     return legalizeBufferStore(MI, MRI, B, false, false);
3391   case Intrinsic::amdgcn_raw_buffer_store_format:
3392   case Intrinsic::amdgcn_struct_buffer_store_format:
3393     return legalizeBufferStore(MI, MRI, B, false, true);
3394   case Intrinsic::amdgcn_raw_tbuffer_store:
3395   case Intrinsic::amdgcn_struct_tbuffer_store:
3396     return legalizeBufferStore(MI, MRI, B, true, true);
3397   case Intrinsic::amdgcn_raw_buffer_load:
3398   case Intrinsic::amdgcn_struct_buffer_load:
3399     return legalizeBufferLoad(MI, MRI, B, false, false);
3400   case Intrinsic::amdgcn_raw_buffer_load_format:
3401   case Intrinsic::amdgcn_struct_buffer_load_format:
3402     return legalizeBufferLoad(MI, MRI, B, true, false);
3403   case Intrinsic::amdgcn_raw_tbuffer_load:
3404   case Intrinsic::amdgcn_struct_tbuffer_load:
3405     return legalizeBufferLoad(MI, MRI, B, true, true);
3406   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3407   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3408   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3409   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3410   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3411   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3412   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3413   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3414   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3415   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3416   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3417   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3418   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3419   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3420   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3421   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3422   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3423   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3424   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3425   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3426   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3427   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3428   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3429   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3430   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3431   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3432     return legalizeBufferAtomic(MI, B, IntrID);
3433   case Intrinsic::amdgcn_atomic_inc:
3434     return legalizeAtomicIncDec(MI, B, true);
3435   case Intrinsic::amdgcn_atomic_dec:
3436     return legalizeAtomicIncDec(MI, B, false);
3437   default: {
3438     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3439             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3440       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3441     return true;
3442   }
3443   }
3444 
3445   return true;
3446 }
3447