1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPULegalizerInfo.h"
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUGlobalISelUtils.h"
25 #include "AMDGPUTargetMachine.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/TargetOpcodes.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Type.h"
35 #include "llvm/Support/Debug.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Round the number of elements to the next power of two.
46 static LLT getPow2VectorType(LLT Ty) {
47   unsigned NElts = Ty.getNumElements();
48   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
49   return Ty.changeNumElements(Pow2NElts);
50 }
51 
52 // Round the number of bits to the next power of two.
53 static LLT getPow2ScalarType(LLT Ty) {
54   unsigned Bits = Ty.getSizeInBits();
55   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
56   return LLT::scalar(Pow2Bits);
57 }
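// For example, getPow2VectorType turns <3 x s16> into <4 x s16>, and
// getPow2ScalarType turns s48 into s64; power-of-two inputs are unchanged.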
58 
59 static LegalityPredicate isMultiple32(unsigned TypeIdx,
60                                       unsigned MaxSize = 1024) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getScalarType();
64     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
65   };
66 }
67 
68 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
69   return [=](const LegalityQuery &Query) {
70     return Query.Types[TypeIdx].getSizeInBits() == Size;
71   };
72 }
73 
74 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
75   return [=](const LegalityQuery &Query) {
76     const LLT Ty = Query.Types[TypeIdx];
77     return Ty.isVector() &&
78            Ty.getNumElements() % 2 != 0 &&
79            Ty.getElementType().getSizeInBits() < 32 &&
80            Ty.getSizeInBits() % 32 != 0;
81   };
82 }
83 
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
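// For example, v3s32 (96 bits) and v4s32 (128 bits) are both mapped to v2s32
// here; the legalizer then splits the original value into pieces of that
// 64-bit type.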
110 
111 // Increase the number of vector elements to reach the next multiple of 32-bit
112 // type.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
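// For example, v3s8 (24 bits) is widened to v4s8 (32 bits), and v5s8
// (40 bits) to v8s8 (64 bits).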
128 
129 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
130   return [=](const LegalityQuery &Query) {
131     const LLT QueryTy = Query.Types[TypeIdx];
132     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
133   };
134 }
135 
136 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
137   return [=](const LegalityQuery &Query) {
138     const LLT QueryTy = Query.Types[TypeIdx];
139     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
140   };
141 }
142 
143 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
144   return [=](const LegalityQuery &Query) {
145     const LLT QueryTy = Query.Types[TypeIdx];
146     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
147   };
148 }
149 
150 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
151 // v2s16.
152 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
153   return [=](const LegalityQuery &Query) {
154     const LLT Ty = Query.Types[TypeIdx];
155     if (Ty.isVector()) {
156       const int EltSize = Ty.getElementType().getSizeInBits();
157       return EltSize == 32 || EltSize == 64 ||
158             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
159              EltSize == 128 || EltSize == 256;
160     }
161 
162     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
163   };
164 }
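// Under this predicate, e.g. v3s32, v4s16 and s96 are register types, while
// v3s16 (odd number of 16-bit elements) and s24 are not.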
165 
166 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
167   return [=](const LegalityQuery &Query) {
168     const LLT QueryTy = Query.Types[TypeIdx];
169     return QueryTy.isVector() && QueryTy.getElementType() == Type;
170   };
171 }
172 
173 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
174   return [=](const LegalityQuery &Query) {
175     const LLT Ty = Query.Types[TypeIdx];
176     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
177            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
178   };
179 }
180 
181 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
182                                          const GCNTargetMachine &TM)
183   : ST(ST_) {
184   using namespace TargetOpcode;
185 
186   auto GetAddrSpacePtr = [&TM](unsigned AS) {
187     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
188   };
189 
190   const LLT S1 = LLT::scalar(1);
191   const LLT S16 = LLT::scalar(16);
192   const LLT S32 = LLT::scalar(32);
193   const LLT S64 = LLT::scalar(64);
194   const LLT S96 = LLT::scalar(96);
195   const LLT S128 = LLT::scalar(128);
196   const LLT S256 = LLT::scalar(256);
197   const LLT S1024 = LLT::scalar(1024);
198 
199   const LLT V2S16 = LLT::vector(2, 16);
200   const LLT V4S16 = LLT::vector(4, 16);
201 
202   const LLT V2S32 = LLT::vector(2, 32);
203   const LLT V3S32 = LLT::vector(3, 32);
204   const LLT V4S32 = LLT::vector(4, 32);
205   const LLT V5S32 = LLT::vector(5, 32);
206   const LLT V6S32 = LLT::vector(6, 32);
207   const LLT V7S32 = LLT::vector(7, 32);
208   const LLT V8S32 = LLT::vector(8, 32);
209   const LLT V9S32 = LLT::vector(9, 32);
210   const LLT V10S32 = LLT::vector(10, 32);
211   const LLT V11S32 = LLT::vector(11, 32);
212   const LLT V12S32 = LLT::vector(12, 32);
213   const LLT V13S32 = LLT::vector(13, 32);
214   const LLT V14S32 = LLT::vector(14, 32);
215   const LLT V15S32 = LLT::vector(15, 32);
216   const LLT V16S32 = LLT::vector(16, 32);
217   const LLT V32S32 = LLT::vector(32, 32);
218 
219   const LLT V2S64 = LLT::vector(2, 64);
220   const LLT V3S64 = LLT::vector(3, 64);
221   const LLT V4S64 = LLT::vector(4, 64);
222   const LLT V5S64 = LLT::vector(5, 64);
223   const LLT V6S64 = LLT::vector(6, 64);
224   const LLT V7S64 = LLT::vector(7, 64);
225   const LLT V8S64 = LLT::vector(8, 64);
226   const LLT V16S64 = LLT::vector(16, 64);
227 
228   std::initializer_list<LLT> AllS32Vectors =
229     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
230      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
231   std::initializer_list<LLT> AllS64Vectors =
232     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
233 
234   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
235   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
236   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
237   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
238   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
239   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
240   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
241 
242   const LLT CodePtr = FlatPtr;
243 
244   const std::initializer_list<LLT> AddrSpaces64 = {
245     GlobalPtr, ConstantPtr, FlatPtr
246   };
247 
248   const std::initializer_list<LLT> AddrSpaces32 = {
249     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
250   };
251 
252   const std::initializer_list<LLT> FPTypesBase = {
253     S32, S64
254   };
255 
256   const std::initializer_list<LLT> FPTypes16 = {
257     S32, S64, S16
258   };
259 
260   const std::initializer_list<LLT> FPTypesPK16 = {
261     S32, S64, S16, V2S16
262   };
263 
264   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
265 
266   setAction({G_BRCOND, S1}, Legal); // VCC branches
267   setAction({G_BRCOND, S32}, Legal); // SCC branches
268 
269   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
270   // elements for v3s16
271   getActionDefinitionsBuilder(G_PHI)
272     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
273     .legalFor(AllS32Vectors)
274     .legalFor(AllS64Vectors)
275     .legalFor(AddrSpaces64)
276     .legalFor(AddrSpaces32)
277     .clampScalar(0, S32, S256)
278     .widenScalarToNextPow2(0, 32)
279     .clampMaxNumElements(0, S32, 16)
280     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
281     .legalIf(isPointer(0));
282 
283   if (ST.has16BitInsts()) {
284     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
285       .legalFor({S32, S16})
286       .clampScalar(0, S16, S32)
287       .scalarize(0)
288       .widenScalarToNextPow2(0, 32);
289   } else {
290     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
291       .legalFor({S32})
292       .clampScalar(0, S32, S32)
293       .scalarize(0);
294   }
295 
296   // FIXME: Not really legal. Placeholder for custom lowering.
297   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
298     .legalFor({S32, S64})
299     .clampScalar(0, S32, S64)
300     .widenScalarToNextPow2(0, 32)
301     .scalarize(0);
302 
303   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
304     .legalFor({S32})
305     .clampScalar(0, S32, S32)
306     .scalarize(0);
307 
308   // Report legal for any types we can handle anywhere. For the cases only legal
309   // on the SALU, RegBankSelect will be able to re-legalize.
310   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
311     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
312     .clampScalar(0, S32, S64)
313     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
314     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
315     .widenScalarToNextPow2(0)
316     .scalarize(0);
317 
318   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
319                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
320     .legalFor({{S32, S1}, {S32, S32}})
321     .clampScalar(0, S32, S32)
322     .scalarize(0); // TODO: Implement.
323 
324   getActionDefinitionsBuilder(G_BITCAST)
325     // Don't worry about the size constraint.
326     .legalIf(all(isRegisterType(0), isRegisterType(1)))
327     .lower();
328 
329 
330   getActionDefinitionsBuilder(G_CONSTANT)
331     .legalFor({S1, S32, S64, S16, GlobalPtr,
332                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
333     .clampScalar(0, S32, S64)
334     .widenScalarToNextPow2(0)
335     .legalIf(isPointer(0));
336 
337   getActionDefinitionsBuilder(G_FCONSTANT)
338     .legalFor({S32, S64, S16})
339     .clampScalar(0, S16, S64);
340 
341   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
342     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
343                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
344     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
345     .clampScalarOrElt(0, S32, S1024)
346     .legalIf(isMultiple32(0))
347     .widenScalarToNextPow2(0, 32)
348     .clampMaxNumElements(0, S32, 16);
349 
350   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
351   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
352     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
353   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
354 
355   auto &FPOpActions = getActionDefinitionsBuilder(
356     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
357     .legalFor({S32, S64});
358   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
359     .customFor({S32, S64});
360   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
361     .customFor({S32, S64});
362 
363   if (ST.has16BitInsts()) {
364     if (ST.hasVOP3PInsts())
365       FPOpActions.legalFor({S16, V2S16});
366     else
367       FPOpActions.legalFor({S16});
368 
369     TrigActions.customFor({S16});
370     FDIVActions.customFor({S16});
371   }
372 
373   auto &MinNumMaxNum = getActionDefinitionsBuilder({
374       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
375 
376   if (ST.hasVOP3PInsts()) {
377     MinNumMaxNum.customFor(FPTypesPK16)
378       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
379       .clampMaxNumElements(0, S16, 2)
380       .clampScalar(0, S16, S64)
381       .scalarize(0);
382   } else if (ST.has16BitInsts()) {
383     MinNumMaxNum.customFor(FPTypes16)
384       .clampScalar(0, S16, S64)
385       .scalarize(0);
386   } else {
387     MinNumMaxNum.customFor(FPTypesBase)
388       .clampScalar(0, S32, S64)
389       .scalarize(0);
390   }
391 
392   if (ST.hasVOP3PInsts())
393     FPOpActions.clampMaxNumElements(0, S16, 2);
394 
395   FPOpActions
396     .scalarize(0)
397     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
398 
399   TrigActions
400     .scalarize(0)
401     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
402 
403   FDIVActions
404     .scalarize(0)
405     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
406 
407   getActionDefinitionsBuilder({G_FNEG, G_FABS})
408     .legalFor(FPTypesPK16)
409     .clampMaxNumElements(0, S16, 2)
410     .scalarize(0)
411     .clampScalar(0, S16, S64);
412 
413   if (ST.has16BitInsts()) {
414     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
415       .legalFor({S32, S64, S16})
416       .scalarize(0)
417       .clampScalar(0, S16, S64);
418   } else {
419     getActionDefinitionsBuilder(G_FSQRT)
420       .legalFor({S32, S64})
421       .scalarize(0)
422       .clampScalar(0, S32, S64);
423 
424     if (ST.hasFractBug()) {
425       getActionDefinitionsBuilder(G_FFLOOR)
426         .customFor({S64})
427         .legalFor({S32, S64})
428         .scalarize(0)
429         .clampScalar(0, S32, S64);
430     } else {
431       getActionDefinitionsBuilder(G_FFLOOR)
432         .legalFor({S32, S64})
433         .scalarize(0)
434         .clampScalar(0, S32, S64);
435     }
436   }
437 
438   getActionDefinitionsBuilder(G_FPTRUNC)
439     .legalFor({{S32, S64}, {S16, S32}})
440     .scalarize(0);
441 
442   getActionDefinitionsBuilder(G_FPEXT)
443     .legalFor({{S64, S32}, {S32, S16}})
444     .lowerFor({{S64, S16}}) // FIXME: Implement
445     .scalarize(0);
446 
447   getActionDefinitionsBuilder(G_FSUB)
448       // Use actual fsub instruction
449       .legalFor({S32})
450       // Must use fadd + fneg
451       .lowerFor({S64, S16, V2S16})
452       .scalarize(0)
453       .clampScalar(0, S32, S64);
454 
455   // Whether this is legal depends on the floating point mode for the function.
456   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
457   if (ST.hasMadF16())
458     FMad.customFor({S32, S16});
459   else
460     FMad.customFor({S32});
461   FMad.scalarize(0)
462       .lower();
463 
464   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
465     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
466                {S32, S1}, {S64, S1}, {S16, S1}})
467     .scalarize(0)
468     .clampScalar(0, S32, S64)
469     .widenScalarToNextPow2(1, 32);
470 
471   // TODO: Split s1->s64 during regbankselect for VALU.
472   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
473     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
474     .lowerFor({{S32, S64}})
475     .lowerIf(typeIs(1, S1))
476     .customFor({{S64, S64}});
477   if (ST.has16BitInsts())
478     IToFP.legalFor({{S16, S16}});
479   IToFP.clampScalar(1, S32, S64)
480        .scalarize(0);
481 
482   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
483     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
484     .customFor({{S64, S64}});
485   if (ST.has16BitInsts())
486     FPToI.legalFor({{S16, S16}});
487   else
488     FPToI.minScalar(1, S32);
489 
490   FPToI.minScalar(0, S32)
491        .scalarize(0)
492        .lower();
493 
494   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
495     .scalarize(0)
496     .lower();
497 
498   if (ST.has16BitInsts()) {
499     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
500       .legalFor({S16, S32, S64})
501       .clampScalar(0, S16, S64)
502       .scalarize(0);
503   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
504     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
505       .legalFor({S32, S64})
506       .clampScalar(0, S32, S64)
507       .scalarize(0);
508   } else {
509     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
510       .legalFor({S32})
511       .customFor({S64})
512       .clampScalar(0, S32, S64)
513       .scalarize(0);
514   }
515 
516   getActionDefinitionsBuilder({G_PTR_ADD, G_PTR_MASK})
517     .scalarize(0)
518     .alwaysLegal();
519 
520   auto &CmpBuilder =
521     getActionDefinitionsBuilder(G_ICMP)
522     // The compare output type differs based on the register bank of the output,
523     // so make both s1 and s32 legal.
524     //
525     // Scalar compares producing output in scc will be promoted to s32, as that
526     // is the allocatable register type that will be needed for the copy from
527     // scc. This will be promoted during RegBankSelect, and we assume something
528     // before that won't try to use s32 result types.
529     //
530     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
531     // bank.
532     .legalForCartesianProduct(
533       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
534     .legalForCartesianProduct(
535       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
536   if (ST.has16BitInsts()) {
537     CmpBuilder.legalFor({{S1, S16}});
538   }
539 
540   CmpBuilder
541     .widenScalarToNextPow2(1)
542     .clampScalar(1, S32, S64)
543     .scalarize(0)
544     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
545 
546   getActionDefinitionsBuilder(G_FCMP)
547     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
548     .widenScalarToNextPow2(1)
549     .clampScalar(1, S32, S64)
550     .scalarize(0);
551 
552   // FIXME: fpow has a selection pattern that should move to custom lowering.
553   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2, G_FPOW});
554   if (ST.has16BitInsts())
555     Exp2Ops.legalFor({S32, S16});
556   else
557     Exp2Ops.legalFor({S32});
558   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
559   Exp2Ops.scalarize(0);
560 
561   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10});
562   if (ST.has16BitInsts())
563     ExpOps.customFor({{S32}, {S16}});
564   else
565     ExpOps.customFor({S32});
566   ExpOps.clampScalar(0, MinScalarFPTy, S32)
567         .scalarize(0);
568 
569   // The 64-bit versions produce 32-bit results, but only on the SALU.
570   getActionDefinitionsBuilder(G_CTPOP)
571     .legalFor({{S32, S32}, {S32, S64}})
572     .clampScalar(0, S32, S32)
573     .clampScalar(1, S32, S64)
574     .scalarize(0)
575     .widenScalarToNextPow2(0, 32)
576     .widenScalarToNextPow2(1, 32);
577 
578   // The hardware instructions return a different result on 0 than the
579   // generic instructions expect: the hardware produces -1, while the generic
580   // instructions produce the bit width.
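  // When lowered, these are expected to be expanded in terms of the
  // *_ZERO_UNDEF forms below, guarded by a zero check, roughly:
  //   %res = select (icmp eq %src, 0), bit-width, ctlz/cttz_zero_undef(%src)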
581   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
582     .scalarize(0)
583     .clampScalar(0, S32, S32)
584     .clampScalar(1, S32, S64)
585     .widenScalarToNextPow2(0, 32)
586     .widenScalarToNextPow2(1, 32)
587     .lower();
588 
589   // The 64-bit versions produce 32-bit results, but only on the SALU.
590   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
591     .legalFor({{S32, S32}, {S32, S64}})
592     .clampScalar(0, S32, S32)
593     .clampScalar(1, S32, S64)
594     .scalarize(0)
595     .widenScalarToNextPow2(0, 32)
596     .widenScalarToNextPow2(1, 32);
597 
598   // TODO: Expand for > s32
599   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
600     .legalFor({S32})
601     .clampScalar(0, S32, S32)
602     .scalarize(0);
603 
604   if (ST.has16BitInsts()) {
605     if (ST.hasVOP3PInsts()) {
606       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
607         .legalFor({S32, S16, V2S16})
608         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
609         .clampMaxNumElements(0, S16, 2)
610         .clampScalar(0, S16, S32)
611         .widenScalarToNextPow2(0)
612         .scalarize(0);
613     } else {
614       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
615         .legalFor({S32, S16})
616         .widenScalarToNextPow2(0)
617         .clampScalar(0, S16, S32)
618         .scalarize(0);
619     }
620   } else {
621     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
622       .legalFor({S32})
623       .clampScalar(0, S32, S32)
624       .widenScalarToNextPow2(0)
625       .scalarize(0);
626   }
627 
628   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
629     return [=](const LegalityQuery &Query) {
630       return Query.Types[TypeIdx0].getSizeInBits() <
631              Query.Types[TypeIdx1].getSizeInBits();
632     };
633   };
634 
635   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
636     return [=](const LegalityQuery &Query) {
637       return Query.Types[TypeIdx0].getSizeInBits() >
638              Query.Types[TypeIdx1].getSizeInBits();
639     };
640   };
641 
642   getActionDefinitionsBuilder(G_INTTOPTR)
643     // List the common cases
644     .legalForCartesianProduct(AddrSpaces64, {S64})
645     .legalForCartesianProduct(AddrSpaces32, {S32})
646     .scalarize(0)
647     // Accept any address space as long as the size matches
648     .legalIf(sameSize(0, 1))
649     .widenScalarIf(smallerThan(1, 0),
650       [](const LegalityQuery &Query) {
651         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
652       })
653     .narrowScalarIf(greaterThan(1, 0),
654       [](const LegalityQuery &Query) {
655         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
656       });
657 
658   getActionDefinitionsBuilder(G_PTRTOINT)
659     // List the common cases
660     .legalForCartesianProduct(AddrSpaces64, {S64})
661     .legalForCartesianProduct(AddrSpaces32, {S32})
662     .scalarize(0)
663     // Accept any address space as long as the size matches
664     .legalIf(sameSize(0, 1))
665     .widenScalarIf(smallerThan(0, 1),
666       [](const LegalityQuery &Query) {
667         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
668       })
669     .narrowScalarIf(
670       greaterThan(0, 1),
671       [](const LegalityQuery &Query) {
672         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
673       });
674 
675   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
676     .scalarize(0)
677     .custom();
678 
679   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
680   // handle some operations by just promoting the register during
681   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
682   auto maxSizeForAddrSpace = [this](unsigned AS, bool IsLoad) -> unsigned {
683     switch (AS) {
684     // FIXME: Private element size.
685     case AMDGPUAS::PRIVATE_ADDRESS:
686       return 32;
687     // FIXME: Check subtarget
688     case AMDGPUAS::LOCAL_ADDRESS:
689       return ST.useDS128() ? 128 : 64;
690 
691     // Treat constant and global as identical. SMRD loads are sometimes usable
692     // for global loads (ideally constant address space should be eliminated)
693     // depending on the context. Legality cannot be context dependent, but
694     // RegBankSelect can split the load as necessary depending on the pointer
695     // register bank/uniformity and if the memory is invariant or not written in
696     // a kernel.
697     case AMDGPUAS::CONSTANT_ADDRESS:
698     case AMDGPUAS::GLOBAL_ADDRESS:
699       return IsLoad ? 512 : 128;
700     default:
701       return 128;
702     }
703   };
704 
705   const auto needToSplitMemOp = [=](const LegalityQuery &Query, bool IsLoad) -> bool {
706     const LLT DstTy = Query.Types[0];
707 
708     // Split vector extloads.
709     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
710     unsigned Align = Query.MMODescrs[0].AlignInBits;
711 
712     if (MemSize < DstTy.getSizeInBits())
713       MemSize = std::max(MemSize, Align);
714 
715     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
716       return true;
717 
718     const LLT PtrTy = Query.Types[1];
719     unsigned AS = PtrTy.getAddressSpace();
720     if (MemSize > maxSizeForAddrSpace(AS, IsLoad))
721       return true;
722 
723     // Catch weird sized loads that don't evenly divide into the access sizes
724     // TODO: May be able to widen depending on alignment etc.
725     unsigned NumRegs = MemSize / 32;
726     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
727       return true;
728 
729     if (Align < MemSize) {
730       const SITargetLowering *TLI = ST.getTargetLowering();
731       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
732     }
733 
734     return false;
735   };
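  // For example, a 96-bit (3 dword) access is split on subtargets without
  // dwordx3 load/stores, and an underaligned access is split unless
  // allowsMisalignedMemoryAccessesImpl says the target can handle it.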
736 
737   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
738   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
739   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
740 
741   // TODO: Refine based on subtargets which support unaligned access or 128-bit
742   // LDS
743   // TODO: Unsupported flat for SI.
744 
745   for (unsigned Op : {G_LOAD, G_STORE}) {
746     const bool IsStore = Op == G_STORE;
747 
748     auto &Actions = getActionDefinitionsBuilder(Op);
749     // Whitelist the common cases.
750     // TODO: Pointer loads
751     // TODO: Wide constant loads
752     // TODO: Only CI+ has 3x loads
753     // TODO: Loads to s16 on gfx9
754     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
755                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
756                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
757                                       {S96, GlobalPtr, 96, GlobalAlign32},
758                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
759                                       {S128, GlobalPtr, 128, GlobalAlign32},
760                                       {S64, GlobalPtr, 64, GlobalAlign32},
761                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
762                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
763                                       {S32, GlobalPtr, 8, GlobalAlign8},
764                                       {S32, GlobalPtr, 16, GlobalAlign16},
765 
766                                       {S32, LocalPtr, 32, 32},
767                                       {S64, LocalPtr, 64, 32},
768                                       {V2S32, LocalPtr, 64, 32},
769                                       {S32, LocalPtr, 8, 8},
770                                       {S32, LocalPtr, 16, 16},
771                                       {V2S16, LocalPtr, 32, 32},
772 
773                                       {S32, PrivatePtr, 32, 32},
774                                       {S32, PrivatePtr, 8, 8},
775                                       {S32, PrivatePtr, 16, 16},
776                                       {V2S16, PrivatePtr, 32, 32},
777 
778                                       {S32, FlatPtr, 32, GlobalAlign32},
779                                       {S32, FlatPtr, 16, GlobalAlign16},
780                                       {S32, FlatPtr, 8, GlobalAlign8},
781                                       {V2S16, FlatPtr, 32, GlobalAlign32},
782 
783                                       {S32, ConstantPtr, 32, GlobalAlign32},
784                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
785                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
786                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
787                                       {S64, ConstantPtr, 64, GlobalAlign32},
788                                       {S128, ConstantPtr, 128, GlobalAlign32},
789                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
790     Actions
791         .customIf(typeIs(1, Constant32Ptr))
792         .narrowScalarIf(
793             [=](const LegalityQuery &Query) -> bool {
794               return !Query.Types[0].isVector() &&
795                      needToSplitMemOp(Query, Op == G_LOAD);
796             },
797             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
798               const LLT DstTy = Query.Types[0];
799               const LLT PtrTy = Query.Types[1];
800 
801               const unsigned DstSize = DstTy.getSizeInBits();
802               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
803 
804               // Split extloads.
805               if (DstSize > MemSize)
806                 return std::make_pair(0, LLT::scalar(MemSize));
807 
808               if (DstSize > 32 && (DstSize % 32 != 0)) {
809                 // FIXME: Need a way to specify non-extload of larger size if
810                 // suitably aligned.
811                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
812               }
813 
814               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
815                                                      Op == G_LOAD);
816               if (MemSize > MaxSize)
817                 return std::make_pair(0, LLT::scalar(MaxSize));
818 
819               unsigned Align = Query.MMODescrs[0].AlignInBits;
820               return std::make_pair(0, LLT::scalar(Align));
821             })
822         .fewerElementsIf(
823             [=](const LegalityQuery &Query) -> bool {
824               return Query.Types[0].isVector() &&
825                      needToSplitMemOp(Query, Op == G_LOAD);
826             },
827             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
828               const LLT DstTy = Query.Types[0];
829               const LLT PtrTy = Query.Types[1];
830 
831               LLT EltTy = DstTy.getElementType();
832               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace(),
833                                                      Op == G_LOAD);
834 
835               // Split if it's too large for the address space.
836               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
837                 unsigned NumElts = DstTy.getNumElements();
838                 unsigned EltSize = EltTy.getSizeInBits();
839 
840                 if (MaxSize % EltSize == 0) {
841                   return std::make_pair(
842                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
843                 }
844 
845                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
846 
847                 // FIXME: Refine when odd breakdowns handled
848                 // The scalars will need to be re-legalized.
849                 if (NumPieces == 1 || NumPieces >= NumElts ||
850                     NumElts % NumPieces != 0)
851                   return std::make_pair(0, EltTy);
852 
853                 return std::make_pair(0,
854                                       LLT::vector(NumElts / NumPieces, EltTy));
855               }
856 
857               // Need to split because of alignment.
858               unsigned Align = Query.MMODescrs[0].AlignInBits;
859               unsigned EltSize = EltTy.getSizeInBits();
860               if (EltSize > Align &&
861                   (EltSize / Align < DstTy.getNumElements())) {
862                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
863               }
864 
865               // May need relegalization for the scalars.
866               return std::make_pair(0, EltTy);
867             })
868         .minScalar(0, S32);
869 
870     if (IsStore)
871       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
872 
873     // TODO: Need a bitcast lower option?
874     Actions
875         .legalIf([=](const LegalityQuery &Query) {
876           const LLT Ty0 = Query.Types[0];
877           unsigned Size = Ty0.getSizeInBits();
878           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
879           unsigned Align = Query.MMODescrs[0].AlignInBits;
880 
881           // FIXME: Widening store from alignment not valid.
882           if (MemSize < Size)
883             MemSize = std::max(MemSize, Align);
884 
885           // No extending vector loads.
886           if (Size > MemSize && Ty0.isVector())
887             return false;
888 
889           switch (MemSize) {
890           case 8:
891           case 16:
892             return Size == 32;
893           case 32:
894           case 64:
895           case 128:
896             return true;
897           case 96:
898             return ST.hasDwordx3LoadStores();
899           case 256:
900           case 512:
901             return true;
902           default:
903             return false;
904           }
905         })
906         .widenScalarToNextPow2(0)
907         // TODO: v3s32->v4s32 with alignment
908         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
909   }
910 
911   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
912                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
913                                                   {S32, GlobalPtr, 16, 2 * 8},
914                                                   {S32, LocalPtr, 8, 8},
915                                                   {S32, LocalPtr, 16, 16},
916                                                   {S32, PrivatePtr, 8, 8},
917                                                   {S32, PrivatePtr, 16, 16},
918                                                   {S32, ConstantPtr, 8, 8},
919                                                   {S32, ConstantPtr, 16, 2 * 8}});
920   if (ST.hasFlatAddressSpace()) {
921     ExtLoads.legalForTypesWithMemDesc(
922         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
923   }
924 
925   ExtLoads.clampScalar(0, S32, S32)
926           .widenScalarToNextPow2(0)
927           .unsupportedIfMemSizeNotPow2()
928           .lower();
929 
930   auto &Atomics = getActionDefinitionsBuilder(
931     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
932      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
933      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
934      G_ATOMICRMW_UMIN})
935     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
936                {S64, GlobalPtr}, {S64, LocalPtr}});
937   if (ST.hasFlatAddressSpace()) {
938     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
939   }
940 
941   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
942     .legalFor({{S32, LocalPtr}});
943 
944   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
945   // output demarshalling.
946   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
947     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
948                 {S32, FlatPtr}, {S64, FlatPtr}})
949     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
950                {S32, RegionPtr}, {S64, RegionPtr}});
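  // The custom lowering is what performs that marshalling; roughly, it packs
  // the new and compare values into the operand layout the instruction wants
  // and pulls the original value back out of the result (see
  // legalizeAtomicCmpXChg).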
951   // TODO: Pointer types, any 32-bit or 64-bit vector
952 
953   // Condition should be s32 for scalar, s1 for vector.
954   getActionDefinitionsBuilder(G_SELECT)
955     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
956           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
957           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
958     .clampScalar(0, S16, S64)
959     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
960     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
961     .scalarize(1)
962     .clampMaxNumElements(0, S32, 2)
963     .clampMaxNumElements(0, LocalPtr, 2)
964     .clampMaxNumElements(0, PrivatePtr, 2)
965     .scalarize(0)
966     .widenScalarToNextPow2(0)
967     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
968 
969   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
970   // be more flexible with the shift amount type.
971   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
972     .legalFor({{S32, S32}, {S64, S32}});
973   if (ST.has16BitInsts()) {
974     if (ST.hasVOP3PInsts()) {
975       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
976             .clampMaxNumElements(0, S16, 2);
977     } else
978       Shifts.legalFor({{S16, S32}, {S16, S16}});
979 
980     // TODO: Support 16-bit shift amounts
981     Shifts.clampScalar(1, S32, S32);
982     Shifts.clampScalar(0, S16, S64);
983     Shifts.widenScalarToNextPow2(0, 16);
984   } else {
985     // Make sure we legalize the shift amount type first, as the general
986     // expansion for the shifted type will produce much worse code if it hasn't
987     // been truncated already.
988     Shifts.clampScalar(1, S32, S32);
989     Shifts.clampScalar(0, S32, S64);
990     Shifts.widenScalarToNextPow2(0, 32);
991   }
992   Shifts.scalarize(0);
993 
994   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
995     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
996     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
997     unsigned IdxTypeIdx = 2;
998 
999     getActionDefinitionsBuilder(Op)
1000       .customIf([=](const LegalityQuery &Query) {
1001           const LLT EltTy = Query.Types[EltTypeIdx];
1002           const LLT VecTy = Query.Types[VecTypeIdx];
1003           const LLT IdxTy = Query.Types[IdxTypeIdx];
1004           return (EltTy.getSizeInBits() == 16 ||
1005                   EltTy.getSizeInBits() % 32 == 0) &&
1006                  VecTy.getSizeInBits() % 32 == 0 &&
1007                  VecTy.getSizeInBits() <= 1024 &&
1008                  IdxTy.getSizeInBits() == 32;
1009         })
1010       .clampScalar(EltTypeIdx, S32, S64)
1011       .clampScalar(VecTypeIdx, S32, S64)
1012       .clampScalar(IdxTypeIdx, S32, S32);
1013   }
1014 
1015   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1016     .unsupportedIf([=](const LegalityQuery &Query) {
1017         const LLT &EltTy = Query.Types[1].getElementType();
1018         return Query.Types[0] != EltTy;
1019       });
1020 
1021   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1022     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1023     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1024 
1025     // FIXME: Doesn't handle extract of illegal sizes.
1026     getActionDefinitionsBuilder(Op)
1027       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1028       // FIXME: Multiples of 16 should not be legal.
1029       .legalIf([=](const LegalityQuery &Query) {
1030           const LLT BigTy = Query.Types[BigTyIdx];
1031           const LLT LitTy = Query.Types[LitTyIdx];
1032           return (BigTy.getSizeInBits() % 32 == 0) &&
1033                  (LitTy.getSizeInBits() % 16 == 0);
1034         })
1035       .widenScalarIf(
1036         [=](const LegalityQuery &Query) {
1037           const LLT BigTy = Query.Types[BigTyIdx];
1038           return (BigTy.getScalarSizeInBits() < 16);
1039         },
1040         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1041       .widenScalarIf(
1042         [=](const LegalityQuery &Query) {
1043           const LLT LitTy = Query.Types[LitTyIdx];
1044           return (LitTy.getScalarSizeInBits() < 16);
1045         },
1046         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1047       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1048       .widenScalarToNextPow2(BigTyIdx, 32);
1049 
1050   }
1051 
1052   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1053     .legalForCartesianProduct(AllS32Vectors, {S32})
1054     .legalForCartesianProduct(AllS64Vectors, {S64})
1055     .clampNumElements(0, V16S32, V32S32)
1056     .clampNumElements(0, V2S64, V16S64)
1057     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1058 
1059   if (ST.hasScalarPackInsts()) {
1060     BuildVector
1061       // FIXME: Should probably widen s1 vectors straight to s32
1062       .minScalarOrElt(0, S16)
1063       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1064       .minScalar(1, S32);
1065 
1066     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1067       .legalFor({V2S16, S32})
1068       .lower();
1069     BuildVector.minScalarOrElt(0, S32);
1070   } else {
1071     BuildVector.customFor({V2S16, S16});
1072     BuildVector.minScalarOrElt(0, S32);
1073 
1074     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1075       .customFor({V2S16, S32})
1076       .lower();
1077   }
1078 
1079   BuildVector.legalIf(isRegisterType(0));
1080 
1081   // FIXME: Clamp maximum size
1082   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1083     .legalIf(isRegisterType(0));
1084 
1085   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1086   // pre-legalize.
1087   if (ST.hasVOP3PInsts()) {
1088     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1089       .customFor({V2S16, V2S16})
1090       .lower();
1091   } else
1092     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1093 
1094   // Merge/Unmerge
1095   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1096     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1097     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1098 
1099     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1100       const LLT &Ty = Query.Types[TypeIdx];
1101       if (Ty.isVector()) {
1102         const LLT &EltTy = Ty.getElementType();
1103         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1104           return true;
1105         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1106           return true;
1107       }
1108       return false;
1109     };
1110 
1111     auto &Builder = getActionDefinitionsBuilder(Op)
1112       // Try to widen to s16 first for small types.
1113       // TODO: Only do this on targets with legal s16 shifts
1114       .minScalarOrEltIf(narrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1115 
1116       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1117       .lowerFor({{S16, V2S16}})
1118       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1119       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1120                            elementTypeIs(1, S16)),
1121                        changeTo(1, V2S16))
1122       // Clamp the little scalar to s32-s256 and make it a power of 2. It's not
1123       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1124       // valid.
1125       .clampScalar(LitTyIdx, S32, S256)
1126       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1127       // Break up vectors with weird elements into scalars
1128       .fewerElementsIf(
1129         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1130         scalarize(0))
1131       .fewerElementsIf(
1132         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1133         scalarize(1))
1134       .clampScalar(BigTyIdx, S32, S1024);
1135 
1136     if (Op == G_MERGE_VALUES) {
1137       Builder.widenScalarIf(
1138         // TODO: Use 16-bit shifts if legal for 8-bit values?
1139         [=](const LegalityQuery &Query) {
1140           const LLT Ty = Query.Types[LitTyIdx];
1141           return Ty.getSizeInBits() < 32;
1142         },
1143         changeTo(LitTyIdx, S32));
1144     }
1145 
1146     Builder.widenScalarIf(
1147       [=](const LegalityQuery &Query) {
1148         const LLT Ty = Query.Types[BigTyIdx];
1149         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1150           Ty.getSizeInBits() % 16 != 0;
1151       },
1152       [=](const LegalityQuery &Query) {
1153         // Pick the next power of 2, or a multiple of 64 over 128,
1154         // whichever is smaller.
1155         const LLT &Ty = Query.Types[BigTyIdx];
1156         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1157         if (NewSizeInBits >= 256) {
1158           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1159           if (RoundedTo < NewSizeInBits)
1160             NewSizeInBits = RoundedTo;
1161         }
1162         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1163       })
1164       .legalIf([=](const LegalityQuery &Query) {
1165           const LLT &BigTy = Query.Types[BigTyIdx];
1166           const LLT &LitTy = Query.Types[LitTyIdx];
1167 
1168           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1169             return false;
1170           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1171             return false;
1172 
1173           return BigTy.getSizeInBits() % 16 == 0 &&
1174                  LitTy.getSizeInBits() % 16 == 0 &&
1175                  BigTy.getSizeInBits() <= 1024;
1176         })
1177       // Any vectors left are the wrong size. Scalarize them.
1178       .scalarize(0)
1179       .scalarize(1);
1180   }
1181 
1182   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1183   // RegBankSelect.
1184   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1185     .legalFor({{S32}, {S64}});
1186 
1187   if (ST.hasVOP3PInsts()) {
1188     SextInReg.lowerFor({{V2S16}})
1189       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1190       // get more vector shift opportunities, since we'll get those when
1191       // expanded.
1192       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1193   } else if (ST.has16BitInsts()) {
1194     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1195   } else {
1196     // Prefer to promote to s32 before lowering if we don't have 16-bit
1197     // shifts. This avoids a lot of intermediate truncate and extend operations.
1198     SextInReg.lowerFor({{S32}, {S64}});
1199   }
1200 
1201   SextInReg
1202     .scalarize(0)
1203     .clampScalar(0, S32, S64)
1204     .lower();
1205 
1206   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1207     .legalFor({S64});
1208 
1209   getActionDefinitionsBuilder({
1210       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1211       G_FCOPYSIGN,
1212 
1213       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1214       G_READ_REGISTER,
1215       G_WRITE_REGISTER,
1216 
1217       G_SADDO, G_SSUBO,
1218 
1219        // TODO: Implement
1220       G_FMINIMUM, G_FMAXIMUM
1221     }).lower();
1222 
1223   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1224         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1225         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1226     .unsupported();
1227 
1228   computeTables();
1229   verify(*ST.getInstrInfo());
1230 }
1231 
1232 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1233                                          MachineRegisterInfo &MRI,
1234                                          MachineIRBuilder &B,
1235                                          GISelChangeObserver &Observer) const {
1236   switch (MI.getOpcode()) {
1237   case TargetOpcode::G_ADDRSPACE_CAST:
1238     return legalizeAddrSpaceCast(MI, MRI, B);
1239   case TargetOpcode::G_FRINT:
1240     return legalizeFrint(MI, MRI, B);
1241   case TargetOpcode::G_FCEIL:
1242     return legalizeFceil(MI, MRI, B);
1243   case TargetOpcode::G_INTRINSIC_TRUNC:
1244     return legalizeIntrinsicTrunc(MI, MRI, B);
1245   case TargetOpcode::G_SITOFP:
1246     return legalizeITOFP(MI, MRI, B, true);
1247   case TargetOpcode::G_UITOFP:
1248     return legalizeITOFP(MI, MRI, B, false);
1249   case TargetOpcode::G_FPTOSI:
1250     return legalizeFPTOI(MI, MRI, B, true);
1251   case TargetOpcode::G_FPTOUI:
1252     return legalizeFPTOI(MI, MRI, B, false);
1253   case TargetOpcode::G_FMINNUM:
1254   case TargetOpcode::G_FMAXNUM:
1255   case TargetOpcode::G_FMINNUM_IEEE:
1256   case TargetOpcode::G_FMAXNUM_IEEE:
1257     return legalizeMinNumMaxNum(MI, MRI, B);
1258   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1259     return legalizeExtractVectorElt(MI, MRI, B);
1260   case TargetOpcode::G_INSERT_VECTOR_ELT:
1261     return legalizeInsertVectorElt(MI, MRI, B);
1262   case TargetOpcode::G_SHUFFLE_VECTOR:
1263     return legalizeShuffleVector(MI, MRI, B);
1264   case TargetOpcode::G_FSIN:
1265   case TargetOpcode::G_FCOS:
1266     return legalizeSinCos(MI, MRI, B);
1267   case TargetOpcode::G_GLOBAL_VALUE:
1268     return legalizeGlobalValue(MI, MRI, B);
1269   case TargetOpcode::G_LOAD:
1270     return legalizeLoad(MI, MRI, B, Observer);
1271   case TargetOpcode::G_FMAD:
1272     return legalizeFMad(MI, MRI, B);
1273   case TargetOpcode::G_FDIV:
1274     return legalizeFDIV(MI, MRI, B);
1275   case TargetOpcode::G_ATOMIC_CMPXCHG:
1276     return legalizeAtomicCmpXChg(MI, MRI, B);
1277   case TargetOpcode::G_FLOG:
1278     return legalizeFlog(MI, B, 1.0f / numbers::log2ef);
1279   case TargetOpcode::G_FLOG10:
1280     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1281   case TargetOpcode::G_FEXP:
1282     return legalizeFExp(MI, B);
1283   case TargetOpcode::G_FFLOOR:
1284     return legalizeFFloor(MI, MRI, B);
1285   case TargetOpcode::G_BUILD_VECTOR:
1286     return legalizeBuildVector(MI, MRI, B);
1287   default:
1288     return false;
1289   }
1290 
1291   llvm_unreachable("expected switch to return");
1292 }
1293 
1294 Register AMDGPULegalizerInfo::getSegmentAperture(
1295   unsigned AS,
1296   MachineRegisterInfo &MRI,
1297   MachineIRBuilder &B) const {
1298   MachineFunction &MF = B.getMF();
1299   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1300   const LLT S32 = LLT::scalar(32);
1301 
1302   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1303 
1304   if (ST.hasApertureRegs()) {
1305     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1306     // getreg.
1307     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1308         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1309         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1310     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1311         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1312         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1313     unsigned Encoding =
1314         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1315         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1316         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1317 
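    // The aperture base is exposed as a 16-bit field of the memory-base
    // hardware register; the getreg below reads that field, and the shift by
    // the field width (WidthM1 + 1 == 16) rebuilds the 32-bit aperture base.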
1318     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1319 
1320     B.buildInstr(AMDGPU::S_GETREG_B32)
1321       .addDef(GetReg)
1322       .addImm(Encoding);
1323     MRI.setType(GetReg, S32);
1324 
1325     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1326     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1327   }
1328 
1329   Register QueuePtr = MRI.createGenericVirtualRegister(
1330     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1331 
1332   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1333   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1334     return Register();
1335 
1336   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1337   // private_segment_aperture_base_hi.
1338   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1339 
1340   // TODO: can we be smarter about machine pointer info?
1341   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1342   MachineMemOperand *MMO = MF.getMachineMemOperand(
1343     PtrInfo,
1344     MachineMemOperand::MOLoad |
1345     MachineMemOperand::MODereferenceable |
1346     MachineMemOperand::MOInvariant,
1347     4,
1348     MinAlign(64, StructOffset));
1349 
1350   Register LoadAddr;
1351 
1352   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1353   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1354 }
1355 
1356 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1357   MachineInstr &MI, MachineRegisterInfo &MRI,
1358   MachineIRBuilder &B) const {
1359   MachineFunction &MF = B.getMF();
1360 
1361   B.setInstr(MI);
1362 
1363   const LLT S32 = LLT::scalar(32);
1364   Register Dst = MI.getOperand(0).getReg();
1365   Register Src = MI.getOperand(1).getReg();
1366 
1367   LLT DstTy = MRI.getType(Dst);
1368   LLT SrcTy = MRI.getType(Src);
1369   unsigned DestAS = DstTy.getAddressSpace();
1370   unsigned SrcAS = SrcTy.getAddressSpace();
1371 
1372   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1373   // vector element.
1374   assert(!DstTy.isVector());
1375 
1376   const AMDGPUTargetMachine &TM
1377     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1378 
1379   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1380   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1381     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1382     return true;
1383   }
1384 
1385   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1386     // Truncate.
1387     B.buildExtract(Dst, Src, 0);
1388     MI.eraseFromParent();
1389     return true;
1390   }
1391 
1392   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1393     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1394     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1395 
1396     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1397     // another pointer type. Merge operands are required to be the same type,
1398     // but creating an extra ptrtoint would be kind of pointless.
1399     auto HighAddr = B.buildConstant(
1400       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1401     B.buildMerge(Dst, {Src, HighAddr});
1402     MI.eraseFromParent();
1403     return true;
1404   }
1405 
1406   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1407     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1408            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1409     unsigned NullVal = TM.getNullPointerValue(DestAS);
1410 
1411     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1412     auto FlatNull = B.buildConstant(SrcTy, 0);
1413 
1414     // Extract low 32-bits of the pointer.
1415     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1416 
1417     auto CmpRes =
1418         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1419     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1420 
1421     MI.eraseFromParent();
1422     return true;
1423   }
1424 
1425   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1426     return false;
1427 
1428   if (!ST.hasFlatAddressSpace())
1429     return false;
1430 
1431   auto SegmentNull =
1432       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1433   auto FlatNull =
1434       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1435 
1436   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1437   if (!ApertureReg.isValid())
1438     return false;
1439 
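  // For segment -> flat, the aperture supplies the high 32 bits of the flat
  // address and the segment offset supplies the low 32 bits; the segment null
  // value maps to the flat null value.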
1440   auto CmpRes =
1441       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1442 
1443   // Coerce the type of the low half of the result so we can use merge_values.
1444   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1445 
1446   // TODO: Should we allow mismatched types but matching sizes in merges to
1447   // avoid the ptrtoint?
1448   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1449   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1450 
1451   MI.eraseFromParent();
1452   return true;
1453 }
1454 
1455 bool AMDGPULegalizerInfo::legalizeFrint(
1456   MachineInstr &MI, MachineRegisterInfo &MRI,
1457   MachineIRBuilder &B) const {
1458   B.setInstr(MI);
1459 
1460   Register Src = MI.getOperand(1).getReg();
1461   LLT Ty = MRI.getType(Src);
1462   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1463 
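  // C1 is 2^52: adding and then subtracting copysign(2^52, Src) rounds Src to
  // an integer because doubles with magnitude >= 2^52 have no fraction bits.
  // C2 is the largest double below 2^52; inputs with |Src| > C2 are already
  // integral and are passed through unchanged by the final select.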
1464   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1465   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1466 
1467   auto C1 = B.buildFConstant(Ty, C1Val);
1468   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1469 
1470   // TODO: Should this propagate fast-math-flags?
1471   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1472   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1473 
1474   auto C2 = B.buildFConstant(Ty, C2Val);
1475   auto Fabs = B.buildFAbs(Ty, Src);
1476 
1477   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1478   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1479   return true;
1480 }
1481 
1482 bool AMDGPULegalizerInfo::legalizeFceil(
1483   MachineInstr &MI, MachineRegisterInfo &MRI,
1484   MachineIRBuilder &B) const {
1485   B.setInstr(MI);
1486 
1487   const LLT S1 = LLT::scalar(1);
1488   const LLT S64 = LLT::scalar(64);
1489 
1490   Register Src = MI.getOperand(1).getReg();
1491   assert(MRI.getType(Src) == S64);
1492 
1493   // result = trunc(src)
1494   // if (src > 0.0 && src != result)
1495   //   result += 1.0
1496 
1497   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1498 
1499   const auto Zero = B.buildFConstant(S64, 0.0);
1500   const auto One = B.buildFConstant(S64, 1.0);
  auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Gt0, NeTrunc);
1504   auto Add = B.buildSelect(S64, And, One, Zero);
1505 
1506   // TODO: Should this propagate fast-math-flags?
1507   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1508   return true;
1509 }
1510 
1511 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1512                                               MachineIRBuilder &B) {
1513   const unsigned FractBits = 52;
1514   const unsigned ExpBits = 11;
1515   LLT S32 = LLT::scalar(32);
1516 
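  // The exponent of an IEEE double occupies bits [62:52], i.e. bits [30:20] of
  // the high 32-bit word. Extract those 11 bits with ubfe (offset
  // FractBits - 32, width ExpBits) and subtract the exponent bias of 1023.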
1517   auto Const0 = B.buildConstant(S32, FractBits - 32);
1518   auto Const1 = B.buildConstant(S32, ExpBits);
1519 
  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
1523 
1524   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1525 }
1526 
1527 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1528   MachineInstr &MI, MachineRegisterInfo &MRI,
1529   MachineIRBuilder &B) const {
1530   B.setInstr(MI);
1531 
1532   const LLT S1 = LLT::scalar(1);
1533   const LLT S32 = LLT::scalar(32);
1534   const LLT S64 = LLT::scalar(64);
1535 
1536   Register Src = MI.getOperand(1).getReg();
1537   assert(MRI.getType(Src) == S64);
1538 
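  // Truncation is done with integer ops on the bit pattern: mask off the
  // fraction bits that lie below the binary point (FractMask >> Exp). If the
  // exponent is negative the result is just the sign bit (+/-0.0); if it is
  // greater than 51 there are no fraction bits and Src is already an integer.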
1539   // TODO: Should this use extract since the low half is unused?
1540   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1541   Register Hi = Unmerge.getReg(1);
1542 
1543   // Extract the upper half, since this is where we will find the sign and
1544   // exponent.
1545   auto Exp = extractF64Exponent(Hi, B);
1546 
1547   const unsigned FractBits = 52;
1548 
1549   // Extract the sign bit.
1550   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1551   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1552 
1553   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1554 
1555   const auto Zero32 = B.buildConstant(S32, 0);
1556 
1557   // Extend back to 64-bits.
1558   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1559 
1560   auto Shr = B.buildAShr(S64, FractMask, Exp);
1561   auto Not = B.buildNot(S64, Shr);
1562   auto Tmp0 = B.buildAnd(S64, Src, Not);
1563   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1564 
1565   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1566   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1567 
1568   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1569   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1570   return true;
1571 }
1572 
1573 bool AMDGPULegalizerInfo::legalizeITOFP(
1574   MachineInstr &MI, MachineRegisterInfo &MRI,
1575   MachineIRBuilder &B, bool Signed) const {
1576   B.setInstr(MI);
1577 
1578   Register Dst = MI.getOperand(0).getReg();
1579   Register Src = MI.getOperand(1).getReg();
1580 
1581   const LLT S64 = LLT::scalar(64);
1582   const LLT S32 = LLT::scalar(32);
1583 
1584   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1585 
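  // Convert a 64-bit integer by splitting it into 32-bit halves: convert the
  // high half (signed or unsigned), scale it by 2^32 with ldexp, then add the
  // unsigned conversion of the low half.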
1586   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1587 
1588   auto CvtHi = Signed ?
1589     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1590     B.buildUITOFP(S64, Unmerge.getReg(1));
1591 
1592   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1593 
1594   auto ThirtyTwo = B.buildConstant(S32, 32);
1595   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1596     .addUse(CvtHi.getReg(0))
1597     .addUse(ThirtyTwo.getReg(0));
1598 
1599   // TODO: Should this propagate fast-math-flags?
1600   B.buildFAdd(Dst, LdExp, CvtLo);
1601   MI.eraseFromParent();
1602   return true;
1603 }
1604 
1605 // TODO: Copied from DAG implementation. Verify logic and document how this
1606 // actually works.
1607 bool AMDGPULegalizerInfo::legalizeFPTOI(
1608   MachineInstr &MI, MachineRegisterInfo &MRI,
1609   MachineIRBuilder &B, bool Signed) const {
1610   B.setInstr(MI);
1611 
1612   Register Dst = MI.getOperand(0).getReg();
1613   Register Src = MI.getOperand(1).getReg();
1614 
1615   const LLT S64 = LLT::scalar(64);
1616   const LLT S32 = LLT::scalar(32);
1617 
1618   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1619 
1620   unsigned Flags = MI.getFlags();
1621 
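  // K0 = 2^-32 and K1 = -2^32. Split trunc(Src) into a high part
  // floor(trunc * 2^-32) and a low remainder fma(hi, -2^32, trunc), each of
  // which fits in 32 bits, then convert the two halves separately.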
1622   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1623   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1624   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1625 
1626   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1627   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1628   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1629 
1630   auto Hi = Signed ?
1631     B.buildFPTOSI(S32, FloorMul) :
1632     B.buildFPTOUI(S32, FloorMul);
1633   auto Lo = B.buildFPTOUI(S32, Fma);
1634 
1635   B.buildMerge(Dst, { Lo, Hi });
1636   MI.eraseFromParent();
1637 
1638   return true;
1639 }
1640 
1641 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1642   MachineInstr &MI, MachineRegisterInfo &MRI,
1643   MachineIRBuilder &B) const {
1644   MachineFunction &MF = B.getMF();
1645   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1646 
1647   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1648                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1649 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1652   if (!MFI->getMode().IEEE)
1653     return !IsIEEEOp;
1654 
1655   if (IsIEEEOp)
1656     return true;
1657 
1658   MachineIRBuilder HelperBuilder(MI);
1659   GISelObserverWrapper DummyObserver;
1660   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1661   HelperBuilder.setInstr(MI);
1662   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1663 }
1664 
1665 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1666   MachineInstr &MI, MachineRegisterInfo &MRI,
1667   MachineIRBuilder &B) const {
1668   // TODO: Should move some of this into LegalizerHelper.
1669 
1670   // TODO: Promote dynamic indexing of s16 to s32
1671   // TODO: Dynamic s64 indexing is only legal for SGPR.
1672   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1673   if (!IdxVal) // Dynamic case will be selected to register indexing.
1674     return true;
1675 
1676   Register Dst = MI.getOperand(0).getReg();
1677   Register Vec = MI.getOperand(1).getReg();
1678 
1679   LLT VecTy = MRI.getType(Vec);
1680   LLT EltTy = VecTy.getElementType();
1681   assert(EltTy == MRI.getType(Dst));
1682 
1683   B.setInstr(MI);
1684 
1685   if (IdxVal.getValue() < VecTy.getNumElements())
1686     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1687   else
1688     B.buildUndef(Dst);
1689 
1690   MI.eraseFromParent();
1691   return true;
1692 }
1693 
1694 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1695   MachineInstr &MI, MachineRegisterInfo &MRI,
1696   MachineIRBuilder &B) const {
1697   // TODO: Should move some of this into LegalizerHelper.
1698 
1699   // TODO: Promote dynamic indexing of s16 to s32
1700   // TODO: Dynamic s64 indexing is only legal for SGPR.
1701   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1702   if (!IdxVal) // Dynamic case will be selected to register indexing.
1703     return true;
1704 
1705   Register Dst = MI.getOperand(0).getReg();
1706   Register Vec = MI.getOperand(1).getReg();
1707   Register Ins = MI.getOperand(2).getReg();
1708 
1709   LLT VecTy = MRI.getType(Vec);
1710   LLT EltTy = VecTy.getElementType();
1711   assert(EltTy == MRI.getType(Ins));
1712 
1713   B.setInstr(MI);
1714 
1715   if (IdxVal.getValue() < VecTy.getNumElements())
1716     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1717   else
1718     B.buildUndef(Dst);
1719 
1720   MI.eraseFromParent();
1721   return true;
1722 }
1723 
1724 static bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
1725   assert(Mask.size() == 2);
1726 
1727   // If one half is undef, the other is trivially in the same reg.
1728   if (Mask[0] == -1 || Mask[1] == -1)
1729     return true;
1730   return ((Mask[0] == 0 || Mask[0] == 1) && (Mask[1] == 0 || Mask[1] == 1)) ||
1731          ((Mask[0] == 2 || Mask[0] == 3) && (Mask[1] == 2 || Mask[1] == 3));
1732 }
1733 
1734 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1735   MachineInstr &MI, MachineRegisterInfo &MRI,
1736   MachineIRBuilder &B) const {
1737   const LLT V2S16 = LLT::vector(2, 16);
1738 
1739   Register Dst = MI.getOperand(0).getReg();
1740   Register Src0 = MI.getOperand(1).getReg();
1741   LLT DstTy = MRI.getType(Dst);
1742   LLT SrcTy = MRI.getType(Src0);
1743 
1744   if (SrcTy == V2S16 && DstTy == V2S16 &&
1745       isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1746     return true;
1747 
1748   MachineIRBuilder HelperBuilder(MI);
1749   GISelObserverWrapper DummyObserver;
1750   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1751   HelperBuilder.setInstr(MI);
1752   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1753 }
1754 
1755 bool AMDGPULegalizerInfo::legalizeSinCos(
1756   MachineInstr &MI, MachineRegisterInfo &MRI,
1757   MachineIRBuilder &B) const {
1758   B.setInstr(MI);
1759 
1760   Register DstReg = MI.getOperand(0).getReg();
1761   Register SrcReg = MI.getOperand(1).getReg();
1762   LLT Ty = MRI.getType(DstReg);
1763   unsigned Flags = MI.getFlags();
1764 
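  // The hardware sin/cos intrinsics take the angle pre-scaled by 1/(2*pi). On
  // subtargets with a reduced valid input range, the scaled value is first
  // wrapped into [0, 1) with amdgcn_fract.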
1765   Register TrigVal;
1766   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
1767   if (ST.hasTrigReducedRange()) {
1768     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1769     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1770       .addUse(MulVal.getReg(0))
1771       .setMIFlags(Flags).getReg(0);
1772   } else
1773     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1774 
1775   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1776     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1777   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1778     .addUse(TrigVal)
1779     .setMIFlags(Flags);
1780   MI.eraseFromParent();
1781   return true;
1782 }
1783 
1784 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1785   Register DstReg, LLT PtrTy,
1786   MachineIRBuilder &B, const GlobalValue *GV,
1787   unsigned Offset, unsigned GAFlags) const {
1788   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1789   // to the following code sequence:
1790   //
1791   // For constant address space:
1792   //   s_getpc_b64 s[0:1]
1793   //   s_add_u32 s0, s0, $symbol
1794   //   s_addc_u32 s1, s1, 0
1795   //
1796   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1797   //   a fixup or relocation is emitted to replace $symbol with a literal
1798   //   constant, which is a pc-relative offset from the encoding of the $symbol
1799   //   operand to the global variable.
1800   //
1801   // For global address space:
1802   //   s_getpc_b64 s[0:1]
1803   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1804   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1805   //
1806   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1807   //   fixups or relocations are emitted to replace $symbol@*@lo and
1808   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1809   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1810   //   operand to the global variable.
1811   //
1812   // What we want here is an offset from the value returned by s_getpc
1813   // (which is the address of the s_add_u32 instruction) to the global
1814   // variable, but since the encoding of $symbol starts 4 bytes after the start
1815   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1816   // small. This requires us to add 4 to the global variable offset in order to
1817   // compute the correct address.
1818 
1819   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1820 
1821   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1822     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
1823 
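  // The first global-address operand is the lo fixup; for GOT/REL relocations
  // a second operand carries the hi fixup (GAFlags + 1 relies on the *_HI flag
  // immediately following its *_LO counterpart), otherwise a literal 0 is
  // added instead.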
1824   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1825     .addDef(PCReg);
1826 
1827   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1828   if (GAFlags == SIInstrInfo::MO_NONE)
1829     MIB.addImm(0);
1830   else
1831     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1832 
1833   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1834 
1835   if (PtrTy.getSizeInBits() == 32)
1836     B.buildExtract(DstReg, PCReg, 0);
1837   return true;
}
1839 
1840 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1841   MachineInstr &MI, MachineRegisterInfo &MRI,
1842   MachineIRBuilder &B) const {
1843   Register DstReg = MI.getOperand(0).getReg();
1844   LLT Ty = MRI.getType(DstReg);
1845   unsigned AS = Ty.getAddressSpace();
1846 
1847   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1848   MachineFunction &MF = B.getMF();
1849   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1850   B.setInstr(MI);
1851 
1852   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1853     if (!MFI->isEntryFunction()) {
1854       const Function &Fn = MF.getFunction();
1855       DiagnosticInfoUnsupported BadLDSDecl(
1856         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1857       Fn.getContext().diagnose(BadLDSDecl);
1858     }
1859 
1860     // TODO: We could emit code to handle the initialization somewhere.
1861     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1862       const SITargetLowering *TLI = ST.getTargetLowering();
1863       if (!TLI->shouldUseLDSConstAddress(GV)) {
1864         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
1865         return true; // Leave in place;
1866       }
1867 
1868       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1869       MI.eraseFromParent();
1870       return true;
1871     }
1872 
1873     const Function &Fn = MF.getFunction();
1874     DiagnosticInfoUnsupported BadInit(
1875       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1876     Fn.getContext().diagnose(BadInit);
1877     return true;
1878   }
1879 
1880   const SITargetLowering *TLI = ST.getTargetLowering();
1881 
1882   if (TLI->shouldEmitFixup(GV)) {
1883     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1884     MI.eraseFromParent();
1885     return true;
1886   }
1887 
1888   if (TLI->shouldEmitPCReloc(GV)) {
1889     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1890     MI.eraseFromParent();
1891     return true;
1892   }
1893 
1894   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1895   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
1896 
1897   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1898     MachinePointerInfo::getGOT(MF),
1899     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1900     MachineMemOperand::MOInvariant,
1901     8 /*Size*/, 8 /*Align*/);
1902 
1903   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1904 
1905   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1907     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1908     B.buildExtract(DstReg, Load, 0);
1909   } else
1910     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1911 
1912   MI.eraseFromParent();
1913   return true;
1914 }
1915 
1916 bool AMDGPULegalizerInfo::legalizeLoad(
1917   MachineInstr &MI, MachineRegisterInfo &MRI,
1918   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1919   B.setInstr(MI);
1920   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1921   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1922   Observer.changingInstr(MI);
1923   MI.getOperand(1).setReg(Cast.getReg(0));
1924   Observer.changedInstr(MI);
1925   return true;
1926 }
1927 
1928 bool AMDGPULegalizerInfo::legalizeFMad(
1929   MachineInstr &MI, MachineRegisterInfo &MRI,
1930   MachineIRBuilder &B) const {
1931   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1932   assert(Ty.isScalar());
1933 
1934   MachineFunction &MF = B.getMF();
1935   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1936 
1937   // TODO: Always legal with future ftz flag.
1938   // FIXME: Do we need just output?
1939   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
1940     return true;
1941   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
1942     return true;
1943 
1944   MachineIRBuilder HelperBuilder(MI);
1945   GISelObserverWrapper DummyObserver;
1946   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1947   HelperBuilder.setMBB(*MI.getParent());
1948   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1949 }
1950 
1951 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1952   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1953   Register DstReg = MI.getOperand(0).getReg();
1954   Register PtrReg = MI.getOperand(1).getReg();
1955   Register CmpVal = MI.getOperand(2).getReg();
1956   Register NewVal = MI.getOperand(3).getReg();
1957 
1958   assert(SITargetLowering::isFlatGlobalAddrSpace(
1959            MRI.getType(PtrReg).getAddressSpace()) &&
1960          "this should not have been custom lowered");
1961 
1962   LLT ValTy = MRI.getType(CmpVal);
1963   LLT VecTy = LLT::vector(2, ValTy);
1964 
1965   B.setInstr(MI);
1966   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1967 
1968   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1969     .addDef(DstReg)
1970     .addUse(PtrReg)
1971     .addUse(PackedVal)
1972     .setMemRefs(MI.memoperands());
1973 
1974   MI.eraseFromParent();
1975   return true;
1976 }
1977 
1978 bool AMDGPULegalizerInfo::legalizeFlog(
1979   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
1980   Register Dst = MI.getOperand(0).getReg();
1981   Register Src = MI.getOperand(1).getReg();
1982   LLT Ty = B.getMRI()->getType(Dst);
1983   unsigned Flags = MI.getFlags();
1984   B.setInstr(MI);
1985 
1986   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
1987   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
1988 
1989   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
1990   MI.eraseFromParent();
1991   return true;
1992 }
1993 
1994 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
1995                                        MachineIRBuilder &B) const {
1996   Register Dst = MI.getOperand(0).getReg();
1997   Register Src = MI.getOperand(1).getReg();
1998   unsigned Flags = MI.getFlags();
1999   LLT Ty = B.getMRI()->getType(Dst);
2000   B.setInstr(MI);
2001 
2002   auto K = B.buildFConstant(Ty, numbers::log2e);
2003   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2004   B.buildFExp2(Dst, Mul, Flags);
2005   MI.eraseFromParent();
2006   return true;
2007 }
2008 
2009 // Find a source register, ignoring any possible source modifiers.
2010 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2011   Register ModSrc = OrigSrc;
2012   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2013     ModSrc = SrcFNeg->getOperand(1).getReg();
2014     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2015       ModSrc = SrcFAbs->getOperand(1).getReg();
2016   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2017     ModSrc = SrcFAbs->getOperand(1).getReg();
2018   return ModSrc;
2019 }
2020 
2021 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2022                                          MachineRegisterInfo &MRI,
2023                                          MachineIRBuilder &B) const {
2024   B.setInstr(MI);
2025 
2026   const LLT S1 = LLT::scalar(1);
2027   const LLT S64 = LLT::scalar(64);
2028   Register Dst = MI.getOperand(0).getReg();
2029   Register OrigSrc = MI.getOperand(1).getReg();
2030   unsigned Flags = MI.getFlags();
2031   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2032          "this should not have been custom lowered");
2033 
2034   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2035   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2036   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2037   // V_FRACT bug is:
2038   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2039   //
2040   // Convert floor(x) to (x - fract(x))
2041 
2042   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2043     .addUse(OrigSrc)
2044     .setMIFlags(Flags);
2045 
2046   // Give source modifier matching some assistance before obscuring a foldable
2047   // pattern.
2048 
2049   // TODO: We can avoid the neg on the fract? The input sign to fract
2050   // shouldn't matter?
2051   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2052 
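  // 0x3fefffffffffffff is the largest f64 strictly less than 1.0
  // (0.99999999999999989), used to clamp the buggy V_FRACT result.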
2053   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2054 
2055   Register Min = MRI.createGenericVirtualRegister(S64);
2056 
2057   // We don't need to concern ourselves with the snan handling difference, so
2058   // use the one which will directly select.
2059   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2060   if (MFI->getMode().IEEE)
2061     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2062   else
2063     B.buildFMinNum(Min, Fract, Const, Flags);
2064 
2065   Register CorrectedFract = Min;
2066   if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2068     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2069   }
2070 
2071   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2072   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2073 
2074   MI.eraseFromParent();
2075   return true;
2076 }
2077 
2078 // Turn an illegal packed v2s16 build vector into bit operations.
2079 // TODO: This should probably be a bitcast action in LegalizerHelper.
2080 bool AMDGPULegalizerInfo::legalizeBuildVector(
2081   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2082   Register Dst = MI.getOperand(0).getReg();
2083   LLT DstTy = MRI.getType(Dst);
2084   const LLT S32 = LLT::scalar(32);
2085   const LLT V2S16 = LLT::vector(2, 16);
2086   (void)DstTy;
2087   (void)V2S16;
2088   assert(DstTy == V2S16);
2089 
2090   Register Src0 = MI.getOperand(1).getReg();
2091   Register Src1 = MI.getOperand(2).getReg();
2092   assert(MRI.getType(Src0) == LLT::scalar(16));
2093 
2094   B.setInstr(MI);
2095   auto Merge = B.buildMerge(S32, {Src0, Src1});
2096   B.buildBitcast(Dst, Merge);
2097 
2098   MI.eraseFromParent();
2099   return true;
2100 }
2101 
// Return the use branch instruction, or null if the usage is invalid.
2103 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2104                                        MachineRegisterInfo &MRI,
2105                                        MachineInstr *&Br) {
2106   Register CondDef = MI.getOperand(0).getReg();
2107   if (!MRI.hasOneNonDBGUse(CondDef))
2108     return nullptr;
2109 
2110   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2111   if (UseMI.getParent() != MI.getParent() ||
2112       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2113     return nullptr;
2114 
2115   // Make sure the cond br is followed by a G_BR
2116   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2117   if (Next != MI.getParent()->end()) {
2118     if (Next->getOpcode() != AMDGPU::G_BR)
2119       return nullptr;
2120     Br = &*Next;
2121   }
2122 
2123   return &UseMI;
2124 }
2125 
2126 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
2127                                                 Register Reg, LLT Ty) const {
2128   Register LiveIn = MRI.getLiveInVirtReg(Reg);
2129   if (LiveIn)
2130     return LiveIn;
2131 
2132   Register NewReg = MRI.createGenericVirtualRegister(Ty);
2133   MRI.addLiveIn(Reg, NewReg);
2134   return NewReg;
2135 }
2136 
2137 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2138                                          const ArgDescriptor *Arg) const {
2139   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2140     return false; // TODO: Handle these
2141 
2142   assert(Arg->getRegister().isPhysical());
2143 
2144   MachineRegisterInfo &MRI = *B.getMRI();
2145 
2146   LLT Ty = MRI.getType(DstReg);
2147   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
2148 
2149   if (Arg->isMasked()) {
2150     // TODO: Should we try to emit this once in the entry block?
2151     const LLT S32 = LLT::scalar(32);
2152     const unsigned Mask = Arg->getMask();
2153     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2154 
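    // A masked argument shares its register with other packed values (for
    // example, packed work item IDs), so shift the field down and mask off the
    // unrelated bits.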
2155     Register AndMaskSrc = LiveIn;
2156 
2157     if (Shift != 0) {
2158       auto ShiftAmt = B.buildConstant(S32, Shift);
2159       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2160     }
2161 
2162     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2163   } else
2164     B.buildCopy(DstReg, LiveIn);
2165 
  // Insert the argument copy if it doesn't already exist.
2167   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2168   if (!MRI.getVRegDef(LiveIn)) {
2169     // FIXME: Should have scoped insert pt
2170     MachineBasicBlock &OrigInsBB = B.getMBB();
2171     auto OrigInsPt = B.getInsertPt();
2172 
2173     MachineBasicBlock &EntryMBB = B.getMF().front();
2174     EntryMBB.addLiveIn(Arg->getRegister());
2175     B.setInsertPt(EntryMBB, EntryMBB.begin());
2176     B.buildCopy(LiveIn, Arg->getRegister());
2177 
2178     B.setInsertPt(OrigInsBB, OrigInsPt);
2179   }
2180 
2181   return true;
2182 }
2183 
2184 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2185   MachineInstr &MI,
2186   MachineRegisterInfo &MRI,
2187   MachineIRBuilder &B,
2188   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2189   B.setInstr(MI);
2190 
2191   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2192 
2193   const ArgDescriptor *Arg;
2194   const TargetRegisterClass *RC;
2195   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
2196   if (!Arg) {
2197     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2198     return false;
2199   }
2200 
2201   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
2202     MI.eraseFromParent();
2203     return true;
2204   }
2205 
2206   return false;
2207 }
2208 
2209 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2210                                        MachineRegisterInfo &MRI,
2211                                        MachineIRBuilder &B) const {
2212   B.setInstr(MI);
2213   Register Dst = MI.getOperand(0).getReg();
2214   LLT DstTy = MRI.getType(Dst);
2215   LLT S16 = LLT::scalar(16);
2216   LLT S32 = LLT::scalar(32);
2217   LLT S64 = LLT::scalar(64);
2218 
2219   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2220     return true;
2221 
2222   if (DstTy == S16)
2223     return legalizeFDIV16(MI, MRI, B);
2224   if (DstTy == S32)
2225     return legalizeFDIV32(MI, MRI, B);
2226   if (DstTy == S64)
2227     return legalizeFDIV64(MI, MRI, B);
2228 
2229   return false;
2230 }
2231 
2232 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2233                                                  MachineRegisterInfo &MRI,
2234                                                  MachineIRBuilder &B) const {
2235   Register Res = MI.getOperand(0).getReg();
2236   Register LHS = MI.getOperand(1).getReg();
2237   Register RHS = MI.getOperand(2).getReg();
2238 
2239   uint16_t Flags = MI.getFlags();
2240 
2241   LLT ResTy = MRI.getType(Res);
2242   LLT S32 = LLT::scalar(32);
2243   LLT S64 = LLT::scalar(64);
2244 
2245   const MachineFunction &MF = B.getMF();
2246   bool Unsafe =
2247     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2248 
2249   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2250     return false;
2251 
2252   if (!Unsafe && ResTy == S32 &&
2253       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2254     return false;
2255 
2256   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2257     // 1 / x -> RCP(x)
2258     if (CLHS->isExactlyValue(1.0)) {
2259       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2260         .addUse(RHS)
2261         .setMIFlags(Flags);
2262 
2263       MI.eraseFromParent();
2264       return true;
2265     }
2266 
2267     // -1 / x -> RCP( FNEG(x) )
2268     if (CLHS->isExactlyValue(-1.0)) {
2269       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2270       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2271         .addUse(FNeg.getReg(0))
2272         .setMIFlags(Flags);
2273 
2274       MI.eraseFromParent();
2275       return true;
2276     }
2277   }
2278 
2279   // x / y -> x * (1.0 / y)
2280   if (Unsafe) {
2281     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2282       .addUse(RHS)
2283       .setMIFlags(Flags);
2284     B.buildFMul(Res, LHS, RCP, Flags);
2285 
2286     MI.eraseFromParent();
2287     return true;
2288   }
2289 
2290   return false;
2291 }
2292 
2293 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2294                                          MachineRegisterInfo &MRI,
2295                                          MachineIRBuilder &B) const {
2296   B.setInstr(MI);
2297   Register Res = MI.getOperand(0).getReg();
2298   Register LHS = MI.getOperand(1).getReg();
2299   Register RHS = MI.getOperand(2).getReg();
2300 
2301   uint16_t Flags = MI.getFlags();
2302 
2303   LLT S16 = LLT::scalar(16);
2304   LLT S32 = LLT::scalar(32);
2305 
2306   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2307   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2308 
2309   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2310     .addUse(RHSExt.getReg(0))
2311     .setMIFlags(Flags);
2312 
2313   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2314   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2315 
2316   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2317     .addUse(RDst.getReg(0))
2318     .addUse(RHS)
2319     .addUse(LHS)
2320     .setMIFlags(Flags);
2321 
2322   MI.eraseFromParent();
2323   return true;
2324 }
2325 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; otherwise, restore the function's default FP32 denorm
// mode.
2328 static void toggleSPDenormMode(bool Enable,
2329                                MachineIRBuilder &B,
2330                                const GCNSubtarget &ST,
2331                                AMDGPU::SIModeRegisterDefaults Mode) {
2332   // Set SP denorm mode to this value.
2333   unsigned SPDenormMode =
2334     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2335 
2336   if (ST.hasDenormModeInst()) {
2337     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2338     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2339 
2340     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2341     B.buildInstr(AMDGPU::S_DENORM_MODE)
2342       .addImm(NewDenormModeValue);
2343 
2344   } else {
2345     // Select FP32 bit field in mode register.
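    // The FP32 denorm controls are a 2-bit field at offset 4 of the MODE
    // register (WIDTH_M1 = 1 encodes a width of 2).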
2346     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2347                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2348                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2349 
2350     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2351       .addImm(SPDenormMode)
2352       .addImm(SPDenormModeBitField);
2353   }
2354 }
2355 
2356 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2357                                          MachineRegisterInfo &MRI,
2358                                          MachineIRBuilder &B) const {
2359   B.setInstr(MI);
2360   Register Res = MI.getOperand(0).getReg();
2361   Register LHS = MI.getOperand(1).getReg();
2362   Register RHS = MI.getOperand(2).getReg();
2363   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2364   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2365 
2366   uint16_t Flags = MI.getFlags();
2367 
2368   LLT S32 = LLT::scalar(32);
2369   LLT S1 = LLT::scalar(1);
2370 
2371   auto One = B.buildFConstant(S32, 1.0f);
2372 
2373   auto DenominatorScaled =
2374     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2375       .addUse(RHS)
2376       .addUse(LHS)
2377       .addImm(1)
2378       .setMIFlags(Flags);
2379   auto NumeratorScaled =
2380     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2381       .addUse(LHS)
2382       .addUse(RHS)
2383       .addImm(0)
2384       .setMIFlags(Flags);
2385 
2386   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2387     .addUse(DenominatorScaled.getReg(0))
2388     .setMIFlags(Flags);
2389   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2390 
2391   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2392   // aren't modeled as reading it.
2393   if (!Mode.allFP32Denormals())
2394     toggleSPDenormMode(true, B, ST, Mode);
2395 
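  // Refine the initial reciprocal estimate with Newton-Raphson: Fma0/Fma1
  // improve 1/denominator, Mul forms the scaled quotient, and Fma2..Fma4
  // compute residuals used by div_fmas/div_fixup to produce the final,
  // correctly scaled result.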
2396   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2397   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2398   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2399   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2400   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2401   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2402 
2403   if (!Mode.allFP32Denormals())
2404     toggleSPDenormMode(false, B, ST, Mode);
2405 
2406   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2407     .addUse(Fma4.getReg(0))
2408     .addUse(Fma1.getReg(0))
2409     .addUse(Fma3.getReg(0))
2410     .addUse(NumeratorScaled.getReg(1))
2411     .setMIFlags(Flags);
2412 
2413   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2414     .addUse(Fmas.getReg(0))
2415     .addUse(RHS)
2416     .addUse(LHS)
2417     .setMIFlags(Flags);
2418 
2419   MI.eraseFromParent();
2420   return true;
2421 }
2422 
2423 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2424                                          MachineRegisterInfo &MRI,
2425                                          MachineIRBuilder &B) const {
2426   B.setInstr(MI);
2427   Register Res = MI.getOperand(0).getReg();
2428   Register LHS = MI.getOperand(1).getReg();
2429   Register RHS = MI.getOperand(2).getReg();
2430 
2431   uint16_t Flags = MI.getFlags();
2432 
2433   LLT S64 = LLT::scalar(64);
2434   LLT S1 = LLT::scalar(1);
2435 
2436   auto One = B.buildFConstant(S64, 1.0);
2437 
2438   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2439     .addUse(LHS)
2440     .addUse(RHS)
2441     .addImm(1)
2442     .setMIFlags(Flags);
2443 
2444   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2445 
2446   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2447     .addUse(DivScale0.getReg(0))
2448     .setMIFlags(Flags);
2449 
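  // Same Newton-Raphson refinement as the f32 path, operating on the scaled
  // f64 numerator and denominator.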
2450   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2451   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2452   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2453 
2454   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2455     .addUse(LHS)
2456     .addUse(RHS)
2457     .addImm(0)
2458     .setMIFlags(Flags);
2459 
2460   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
2462   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2463 
2464   Register Scale;
2465   if (!ST.hasUsableDivScaleConditionOutput()) {
2466     // Workaround a hardware bug on SI where the condition output from div_scale
2467     // is not usable.
2468 
2469     LLT S32 = LLT::scalar(32);
2470 
2471     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2472     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2473     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2474     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2475 
2476     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2477                               Scale1Unmerge.getReg(1));
2478     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2479                               Scale0Unmerge.getReg(1));
2480     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
2481   } else {
2482     Scale = DivScale1.getReg(1);
2483   }
2484 
2485   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2486     .addUse(Fma4.getReg(0))
2487     .addUse(Fma3.getReg(0))
2488     .addUse(Mul.getReg(0))
2489     .addUse(Scale)
2490     .setMIFlags(Flags);
2491 
2492   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2493     .addUse(Fmas.getReg(0))
2494     .addUse(RHS)
2495     .addUse(LHS)
2496     .setMIFlags(Flags);
2497 
2498   MI.eraseFromParent();
2499   return true;
2500 }
2501 
2502 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
2503                                                  MachineRegisterInfo &MRI,
2504                                                  MachineIRBuilder &B) const {
2505   B.setInstr(MI);
2506   Register Res = MI.getOperand(0).getReg();
2507   Register LHS = MI.getOperand(2).getReg();
2508   Register RHS = MI.getOperand(3).getReg();
2509   uint16_t Flags = MI.getFlags();
2510 
2511   LLT S32 = LLT::scalar(32);
2512   LLT S1 = LLT::scalar(1);
2513 
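  // If |RHS| exceeds 2^96 (0x6f800000), pre-scale the denominator by 2^-32
  // (0x2f800000) before taking the reciprocal and multiply the quotient by the
  // same factor afterwards; the scale cancels algebraically and keeps the
  // intermediate values in a safe range for the rcp approximation.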
2514   auto Abs = B.buildFAbs(S32, RHS, Flags);
2515   const APFloat C0Val(1.0f);
2516 
2517   auto C0 = B.buildConstant(S32, 0x6f800000);
2518   auto C1 = B.buildConstant(S32, 0x2f800000);
2519   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
2520 
2521   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
2522   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
2523 
2524   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
2525 
2526   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2527     .addUse(Mul0.getReg(0))
2528     .setMIFlags(Flags);
2529 
2530   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
2531 
2532   B.buildFMul(Res, Sel, Mul1, Flags);
2533 
2534   MI.eraseFromParent();
2535   return true;
2536 }
2537 
2538 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
2539                                                  MachineRegisterInfo &MRI,
2540                                                  MachineIRBuilder &B) const {
2541   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2542   if (!MFI->isEntryFunction()) {
2543     return legalizePreloadedArgIntrin(MI, MRI, B,
2544                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2545   }
2546 
2547   B.setInstr(MI);
2548 
2549   uint64_t Offset =
2550     ST.getTargetLowering()->getImplicitParameterOffset(
2551       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
2552   Register DstReg = MI.getOperand(0).getReg();
2553   LLT DstTy = MRI.getType(DstReg);
2554   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
2555 
2556   const ArgDescriptor *Arg;
2557   const TargetRegisterClass *RC;
2558   std::tie(Arg, RC)
2559     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2560   if (!Arg)
2561     return false;
2562 
2563   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
2564   if (!loadInputValue(KernargPtrReg, B, Arg))
2565     return false;
2566 
2567   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
2568   MI.eraseFromParent();
2569   return true;
2570 }
2571 
2572 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
2573                                               MachineRegisterInfo &MRI,
2574                                               MachineIRBuilder &B,
2575                                               unsigned AddrSpace) const {
2576   B.setInstr(MI);
2577   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
2578   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
2579   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
2580   MI.eraseFromParent();
2581   return true;
2582 }
2583 
2584 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
2585 // offset (the offset that is included in bounds checking and swizzling, to be
2586 // split between the instruction's voffset and immoffset fields) and soffset
2587 // (the offset that is excluded from bounds checking and swizzling, to go in
2588 // the instruction's soffset field).  This function takes the first kind of
2589 // offset and figures out how to split it between voffset and immoffset.
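// For example, a constant offset of 4100 becomes an immoffset of 4 plus a
// 4096 contribution to the voffset, keeping the voffset part a multiple of
// 4096 so it can be CSEd across similar accesses.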
2590 std::tuple<Register, unsigned, unsigned>
2591 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
2592                                         Register OrigOffset) const {
2593   const unsigned MaxImm = 4095;
2594   Register BaseReg;
2595   unsigned TotalConstOffset;
2596   MachineInstr *OffsetDef;
2597   const LLT S32 = LLT::scalar(32);
2598 
2599   std::tie(BaseReg, TotalConstOffset, OffsetDef)
2600     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
2601 
2602   unsigned ImmOffset = TotalConstOffset;
2603 
  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits (value & 4095) in the immoffset field so that the value that
  // is copied/added for the voffset field is a multiple of 4096, and it stands
  // a better chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not round down to a multiple of 4096 if that would produce a
  // negative voffset, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
2611   unsigned Overflow = ImmOffset & ~MaxImm;
2612   ImmOffset -= Overflow;
2613   if ((int32_t)Overflow < 0) {
2614     Overflow += ImmOffset;
2615     ImmOffset = 0;
2616   }
2617 
2618   if (Overflow != 0) {
2619     if (!BaseReg) {
2620       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
2621     } else {
2622       auto OverflowVal = B.buildConstant(S32, Overflow);
2623       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
2624     }
2625   }
2626 
2627   if (!BaseReg)
2628     BaseReg = B.buildConstant(S32, 0).getReg(0);
2629 
2630   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
2631 }
2632 
2633 /// Handle register layout difference for f16 images for some subtargets.
2634 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
2635                                              MachineRegisterInfo &MRI,
2636                                              Register Reg) const {
2637   if (!ST.hasUnpackedD16VMem())
2638     return Reg;
2639 
2640   const LLT S16 = LLT::scalar(16);
2641   const LLT S32 = LLT::scalar(32);
2642   LLT StoreVT = MRI.getType(Reg);
2643   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
2644 
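  // On subtargets with unpacked D16 memory instructions, each 16-bit element
  // occupies its own 32-bit register, so widen <N x s16> to <N x s32>.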
2645   auto Unmerge = B.buildUnmerge(S16, Reg);
2646 
2647   SmallVector<Register, 4> WideRegs;
2648   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2649     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
2650 
2651   int NumElts = StoreVT.getNumElements();
2652 
2653   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
2654 }
2655 
2656 Register AMDGPULegalizerInfo::fixStoreSourceType(
2657   MachineIRBuilder &B, Register VData, bool IsFormat) const {
2658   MachineRegisterInfo *MRI = B.getMRI();
2659   LLT Ty = MRI->getType(VData);
2660 
2661   const LLT S16 = LLT::scalar(16);
2662 
  // Fixup illegal register types for i8 and i16 stores.
2664   if (Ty == LLT::scalar(8) || Ty == S16) {
2665     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2666     return AnyExt;
2667   }
2668 
2669   if (Ty.isVector()) {
2670     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2671       if (IsFormat)
2672         return handleD16VData(B, *MRI, VData);
2673     }
2674   }
2675 
2676   return VData;
2677 }
2678 
2679 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
2680                                               MachineRegisterInfo &MRI,
2681                                               MachineIRBuilder &B,
2682                                               bool IsTyped,
2683                                               bool IsFormat) const {
2684   B.setInstr(MI);
2685 
2686   Register VData = MI.getOperand(1).getReg();
2687   LLT Ty = MRI.getType(VData);
2688   LLT EltTy = Ty.getScalarType();
2689   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2690   const LLT S32 = LLT::scalar(32);
2691 
2692   VData = fixStoreSourceType(B, VData, IsFormat);
2693   Register RSrc = MI.getOperand(2).getReg();
2694 
2695   MachineMemOperand *MMO = *MI.memoperands_begin();
2696   const int MemSize = MMO->getSize();
2697 
2698   unsigned ImmOffset;
2699   unsigned TotalOffset;
2700 
2701   // The typed intrinsics add an immediate after the registers.
2702   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2703 
2704   // The struct intrinsic variants add one additional operand over raw.
2705   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2706   Register VIndex;
2707   int OpOffset = 0;
2708   if (HasVIndex) {
2709     VIndex = MI.getOperand(3).getReg();
2710     OpOffset = 1;
2711   }
2712 
2713   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2714   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2715 
2716   unsigned Format = 0;
2717   if (IsTyped) {
2718     Format = MI.getOperand(5 + OpOffset).getImm();
2719     ++OpOffset;
2720   }
2721 
2722   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2723 
2724   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2725   if (TotalOffset != 0)
2726     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2727 
2728   unsigned Opc;
2729   if (IsTyped) {
2730     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
2731                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
2732   } else if (IsFormat) {
2733     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
2734                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
2735   } else {
2736     switch (MemSize) {
2737     case 1:
2738       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
2739       break;
2740     case 2:
2741       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
2742       break;
2743     default:
2744       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
2745       break;
2746     }
2747   }
2748 
2749   if (!VIndex)
2750     VIndex = B.buildConstant(S32, 0).getReg(0);
2751 
2752   auto MIB = B.buildInstr(Opc)
2753     .addUse(VData)              // vdata
2754     .addUse(RSrc)               // rsrc
2755     .addUse(VIndex)             // vindex
2756     .addUse(VOffset)            // voffset
2757     .addUse(SOffset)            // soffset
2758     .addImm(ImmOffset);         // offset(imm)
2759 
2760   if (IsTyped)
2761     MIB.addImm(Format);
2762 
2763   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2764      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2765      .addMemOperand(MMO);
2766 
2767   MI.eraseFromParent();
2768   return true;
2769 }
2770 
2771 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
2772                                              MachineRegisterInfo &MRI,
2773                                              MachineIRBuilder &B,
2774                                              bool IsFormat,
2775                                              bool IsTyped) const {
2776   B.setInstr(MI);
2777 
2778   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
2779   MachineMemOperand *MMO = *MI.memoperands_begin();
2780   const int MemSize = MMO->getSize();
2781   const LLT S32 = LLT::scalar(32);
2782 
2783   Register Dst = MI.getOperand(0).getReg();
2784   Register RSrc = MI.getOperand(2).getReg();
2785 
2786   // The typed intrinsics add an immediate after the registers.
2787   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
2788 
2789   // The struct intrinsic variants add one additional operand over raw.
2790   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2791   Register VIndex;
2792   int OpOffset = 0;
2793   if (HasVIndex) {
2794     VIndex = MI.getOperand(3).getReg();
2795     OpOffset = 1;
2796   }
2797 
2798   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
2799   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
2800 
2801   unsigned Format = 0;
2802   if (IsTyped) {
2803     Format = MI.getOperand(5 + OpOffset).getImm();
2804     ++OpOffset;
2805   }
2806 
2807   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
2808   unsigned ImmOffset;
2809   unsigned TotalOffset;
2810 
2811   LLT Ty = MRI.getType(Dst);
2812   LLT EltTy = Ty.getScalarType();
2813   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
2814   const bool Unpacked = ST.hasUnpackedD16VMem();
2815 
2816   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2817   if (TotalOffset != 0)
2818     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
2819 
2820   unsigned Opc;
2821 
2822   if (IsTyped) {
2823     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
2824                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
2825   } else if (IsFormat) {
2826     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
2827                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
2828   } else {
2829     switch (MemSize) {
2830     case 1:
2831       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
2832       break;
2833     case 2:
2834       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
2835       break;
2836     default:
2837       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
2838       break;
2839     }
2840   }
2841 
2842   Register LoadDstReg;
2843 
2844   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
2845   LLT UnpackedTy = Ty.changeElementSize(32);
2846 
2847   if (IsExtLoad)
2848     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
2849   else if (Unpacked && IsD16 && Ty.isVector())
2850     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
2851   else
2852     LoadDstReg = Dst;
2853 
2854   if (!VIndex)
2855     VIndex = B.buildConstant(S32, 0).getReg(0);
2856 
2857   auto MIB = B.buildInstr(Opc)
2858     .addDef(LoadDstReg)         // vdata
2859     .addUse(RSrc)               // rsrc
2860     .addUse(VIndex)             // vindex
2861     .addUse(VOffset)            // voffset
2862     .addUse(SOffset)            // soffset
2863     .addImm(ImmOffset);         // offset(imm)
2864 
2865   if (IsTyped)
2866     MIB.addImm(Format);
2867 
2868   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
2869      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
2870      .addMemOperand(MMO);
2871 
2872   if (LoadDstReg != Dst) {
2873     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2874 
    // The result register was widened for the extending load, so truncate it
    // back down to the requested type.
2876     if (IsExtLoad)
2877       B.buildTrunc(Dst, LoadDstReg);
2878     else {
2879       // Repack to original 16-bit vector result
2880       // FIXME: G_TRUNC should work, but legalization currently fails
2881       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
2882       SmallVector<Register, 4> Repack;
2883       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
2884         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
2885       B.buildMerge(Dst, Repack);
2886     }
2887   }
2888 
2889   MI.eraseFromParent();
2890   return true;
2891 }
2892 
2893 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
2894                                                MachineIRBuilder &B,
2895                                                bool IsInc) const {
2896   B.setInstr(MI);
2897   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
2898                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
2899   B.buildInstr(Opc)
2900     .addDef(MI.getOperand(0).getReg())
2901     .addUse(MI.getOperand(2).getReg())
2902     .addUse(MI.getOperand(3).getReg())
2903     .cloneMemRefs(MI);
2904   MI.eraseFromParent();
2905   return true;
2906 }
2907 
2908 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
2909   switch (IntrID) {
2910   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
2911   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
2912     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
2913   case Intrinsic::amdgcn_raw_buffer_atomic_add:
2914   case Intrinsic::amdgcn_struct_buffer_atomic_add:
2915     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
2916   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
2917   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
2918     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
2919   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
2920   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
2921     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
2922   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
2923   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
2924     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
2925   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
2926   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
2927     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
2928   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
2929   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
2930     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
2931   case Intrinsic::amdgcn_raw_buffer_atomic_and:
2932   case Intrinsic::amdgcn_struct_buffer_atomic_and:
2933     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
2934   case Intrinsic::amdgcn_raw_buffer_atomic_or:
2935   case Intrinsic::amdgcn_struct_buffer_atomic_or:
2936     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
2937   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
2938   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
2939     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
2940   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
2941   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
2942     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
2943   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
2944   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
2945     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
2946   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
2947   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
2948     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
2949   default:
2950     llvm_unreachable("unhandled atomic opcode");
2951   }
2952 }
2953 
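// Lower the raw/struct buffer atomic intrinsics to G_AMDGPU_BUFFER_ATOMIC_*
// pseudos. The struct variants carry an extra vindex operand, cmpswap carries
// an extra compare value, and the variable voffset is split so that a
// constant part can be folded into the instruction's immediate offset field.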
2954 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
2955                                                MachineIRBuilder &B,
2956                                                Intrinsic::ID IID) const {
2957   B.setInstr(MI);
2958 
2959   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
2960                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
2961 
2962   Register Dst = MI.getOperand(0).getReg();
2963   Register VData = MI.getOperand(2).getReg();
2964 
2965   Register CmpVal;
2966   int OpOffset = 0;
2967 
2968   if (IsCmpSwap) {
2969     CmpVal = MI.getOperand(3 + OpOffset).getReg();
2970     ++OpOffset;
2971   }
2972 
2973   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
2974   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
2975 
2976   // The struct intrinsic variants add one additional operand over raw.
2977   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
2978   Register VIndex;
2979   if (HasVIndex) {
2980     VIndex = MI.getOperand(4 + OpOffset).getReg();
2981     ++OpOffset;
2982   }
2983 
2984   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
2985   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
2986   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
2987 
2988   MachineMemOperand *MMO = *MI.memoperands_begin();
2989 
2990   unsigned ImmOffset;
2991   unsigned TotalOffset;
2992   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
2993   if (TotalOffset != 0)
2994     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
2995 
2996   if (!VIndex)
2997     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
2998 
2999   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3000     .addDef(Dst)
3001     .addUse(VData); // vdata
3002 
3003   if (IsCmpSwap)
    MIB.addUse(CmpVal);          // cmp
3005 
3006   MIB.addUse(RSrc)               // rsrc
3007      .addUse(VIndex)             // vindex
3008      .addUse(VOffset)            // voffset
3009      .addUse(SOffset)            // soffset
3010      .addImm(ImmOffset)          // offset(imm)
3011      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3012      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3013      .addMemOperand(MMO);
3014 
3015   MI.eraseFromParent();
3016   return true;
3017 }
3018 
3019 // Produce a vector of s16 elements from s32 pieces.
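// Illustrative (simplified): s32 pieces %a, %b become
//   %dst:_(<2 x s16>) = G_BUILD_VECTOR trunc(%a), trunc(%b)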
3020 static void truncToS16Vector(MachineIRBuilder &B, Register DstReg,
3021                              ArrayRef<Register> UnmergeParts) {
3022   const LLT S16 = LLT::scalar(16);
3023 
3024   SmallVector<Register, 4> RemergeParts(UnmergeParts.size());
3025   for (int I = 0, E = UnmergeParts.size(); I != E; ++I)
3026     RemergeParts[I] = B.buildTrunc(S16, UnmergeParts[I]).getReg(0);
3027 
3028   B.buildBuildVector(DstReg, RemergeParts);
3029 }
3030 
3031 /// Convert a set of s32 registers to a result vector with s16 elements.
3032 static void bitcastToS16Vector(MachineIRBuilder &B, Register DstReg,
3033                                ArrayRef<Register> UnmergeParts) {
3034   MachineRegisterInfo &MRI = *B.getMRI();
3035   const LLT V2S16 = LLT::vector(2, 16);
3036   LLT TargetTy = MRI.getType(DstReg);
3037   int NumElts = UnmergeParts.size();
3038 
3039   if (NumElts == 1) {
3040     assert(TargetTy == V2S16);
3041     B.buildBitcast(DstReg, UnmergeParts[0]);
3042     return;
3043   }
3044 
3045   SmallVector<Register, 4> RemergeParts(NumElts);
3046   for (int I = 0; I != NumElts; ++I)
3047     RemergeParts[I] = B.buildBitcast(V2S16, UnmergeParts[I]).getReg(0);
3048 
3049   if (TargetTy.getSizeInBits() == 32u * NumElts) {
3050     B.buildConcatVectors(DstReg, RemergeParts);
3051     return;
3052   }
3053 
3054   const LLT V3S16 = LLT::vector(3, 16);
3055   const LLT V6S16 = LLT::vector(6, 16);
3056 
3057   // Widen to v6s16 and unpack v3 parts.
3058   assert(TargetTy == V3S16);
3059 
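  // Pad the two v2s16 pieces with an undef v2s16, concatenate into a v6s16,
  // then unmerge into two v3s16 halves and keep only the first in DstReg.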
3060   RemergeParts.push_back(B.buildUndef(V2S16).getReg(0));
3061   auto Concat = B.buildConcatVectors(V6S16, RemergeParts);
3062   B.buildUnmerge({DstReg, MRI.createGenericVirtualRegister(V3S16)}, Concat);
3063 }
3064 
// FIXME: A plain vector truncate should be sufficient, but legalization is
// currently broken.
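// Truncate each s32 element of the unpacked wide result back to s16 and
// rebuild the original <N x s16> vector destination.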
3067 static void repackUnpackedD16Load(MachineIRBuilder &B, Register DstReg,
3068                                   Register WideDstReg) {
3069   const LLT S32 = LLT::scalar(32);
3070   const LLT S16 = LLT::scalar(16);
3071 
3072   auto Unmerge = B.buildUnmerge(S32, WideDstReg);
3073 
3074   int NumOps = Unmerge->getNumOperands() - 1;
3075   SmallVector<Register, 4> RemergeParts(NumOps);
3076   for (int I = 0; I != NumOps; ++I)
3077     RemergeParts[I] = B.buildTrunc(S16, Unmerge.getReg(I)).getReg(0);
3078 
3079   B.buildBuildVector(DstReg, RemergeParts);
3080 }
3081 
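// Rewrite d16 image loads/stores for subtargets with the unpacked register
// layout, and rewrite TFE results from the 2-register form into one wider
// register that is unmerged back into the data and the extra dword afterwards.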
3082 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3083     MachineInstr &MI, MachineIRBuilder &B,
3084     GISelChangeObserver &Observer,
3085     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3086   bool IsTFE = MI.getNumExplicitDefs() == 2;
3087 
  // Only process the operands of d16 image operations on subtargets that use
  // the unpacked register layout, or when the TFE result needs to be repacked.
3090 
3091   // TODO: Need to handle a16 images too
3092   // TODO: Do we need to guard against already legalized intrinsics?
3093   if (!IsTFE && !ST.hasUnpackedD16VMem())
3094     return true;
3095 
3096   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3097     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3098 
3099   if (BaseOpcode->Atomic) // No d16 atomics, or TFE.
3100     return true;
3101 
3102   B.setInstr(MI);
3103 
3104   MachineRegisterInfo *MRI = B.getMRI();
3105   const LLT S32 = LLT::scalar(32);
3106   const LLT S16 = LLT::scalar(16);
3107 
3108   if (BaseOpcode->Store) { // No TFE for stores?
3109     Register VData = MI.getOperand(1).getReg();
3110     LLT Ty = MRI->getType(VData);
3111     if (!Ty.isVector() || Ty.getElementType() != S16)
3112       return true;
3113 
3114     B.setInstr(MI);
3115 
3116     Observer.changingInstr(MI);
3117     MI.getOperand(1).setReg(handleD16VData(B, *MRI, VData));
3118     Observer.changedInstr(MI);
3119     return true;
3120   }
3121 
3122   Register DstReg = MI.getOperand(0).getReg();
3123   LLT Ty = MRI->getType(DstReg);
3124   const LLT EltTy = Ty.getScalarType();
3125   const bool IsD16 = Ty.getScalarType() == S16;
3126   const unsigned NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3127 
3128   if (IsTFE) {
    // In the IR, TFE is supposed to be used with a 2-element struct return
    // type. The instruction really returns these two values in one contiguous
    // register, with one additional dword beyond the loaded data. Rewrite the
    // return type to use a single register result.
3133     Register Dst1Reg = MI.getOperand(1).getReg();
3134     if (MRI->getType(Dst1Reg) != S32)
3135       return false;
3136 
3137     // TODO: Make sure the TFE operand bit is set.
3138 
    // The raw dword-aligned data component of the load. The only legal cases
    // where this matters should be when using the packed D16 format, for
    // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3142     LLT RoundedTy;
3143     LLT TFETy;
3144 
3145     if (IsD16 && ST.hasUnpackedD16VMem()) {
3146       RoundedTy = LLT::scalarOrVector(NumElts, 32);
3147       TFETy = LLT::vector(NumElts + 1, 32);
3148     } else {
3149       unsigned EltSize = Ty.getScalarSizeInBits();
3150       unsigned RoundedElts = (Ty.getSizeInBits() + 31) / 32;
3151       unsigned RoundedSize = 32 * RoundedElts;
3152       RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3153       TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3154     }
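    // For example, a <2 x s16> d16 result gives RoundedTy = <2 x s32> and
    // TFETy = <3 x s32> with the unpacked layout, or RoundedTy = <2 x s16>
    // and TFETy = <2 x s32> with the packed layout.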
3155 
3156     Register TFEReg = MRI->createGenericVirtualRegister(TFETy);
3157     Observer.changingInstr(MI);
3158 
3159     MI.getOperand(0).setReg(TFEReg);
3160     MI.RemoveOperand(1);
3161 
3162     Observer.changedInstr(MI);
3163 
3164     // Insert after the instruction.
3165     B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3166 
3167     // Now figure out how to copy the new result register back into the old
3168     // result.
3169 
3170     SmallVector<Register, 5> UnmergeResults(TFETy.getNumElements(), Dst1Reg);
3171     int NumDataElts = TFETy.getNumElements() - 1;
3172 
3173     if (!Ty.isVector()) {
3174       // Simplest case is a trivial unmerge (plus a truncate for d16).
3175       UnmergeResults[0] = Ty == S32 ?
3176         DstReg : MRI->createGenericVirtualRegister(S32);
3177 
3178       B.buildUnmerge(UnmergeResults, TFEReg);
3179       if (Ty != S32)
3180         B.buildTrunc(DstReg, UnmergeResults[0]);
3181       return true;
3182     }
3183 
3184     // We have to repack into a new vector of some kind.
3185     for (int I = 0; I != NumDataElts; ++I)
3186       UnmergeResults[I] = MRI->createGenericVirtualRegister(S32);
3187     B.buildUnmerge(UnmergeResults, TFEReg);
3188 
3189     // Drop the final TFE element.
3190     ArrayRef<Register> DataPart(UnmergeResults.data(), NumDataElts);
3191 
3192     if (EltTy == S32)
3193       B.buildBuildVector(DstReg, DataPart);
3194     else if (ST.hasUnpackedD16VMem())
3195       truncToS16Vector(B, DstReg, DataPart);
3196     else
3197       bitcastToS16Vector(B, DstReg, DataPart);
3198 
3199     return true;
3200   }
3201 
3202   // Must be an image load.
3203   if (!Ty.isVector() || Ty.getElementType() != S16)
3204     return true;
3205 
3206   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3207 
3208   LLT WidenedTy = Ty.changeElementType(S32);
3209   Register WideDstReg = MRI->createGenericVirtualRegister(WidenedTy);
3210 
3211   Observer.changingInstr(MI);
3212   MI.getOperand(0).setReg(WideDstReg);
3213   Observer.changedInstr(MI);
3214 
3215   repackUnpackedD16Load(B, DstReg, WideDstReg);
3216   return true;
3217 }
3218 
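// Lower llvm.amdgcn.s.buffer.load to G_AMDGPU_S_BUFFER_LOAD, attaching an
// invariant memory operand and widening non-power-of-2 results (e.g. s96 to
// s128) to a size that is known to be legal.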
3219 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
3220   MachineInstr &MI, MachineIRBuilder &B,
3221   GISelChangeObserver &Observer) const {
3222   Register Dst = MI.getOperand(0).getReg();
3223   LLT Ty = B.getMRI()->getType(Dst);
3224   unsigned Size = Ty.getSizeInBits();
3225   MachineFunction &MF = B.getMF();
3226 
3227   Observer.changingInstr(MI);
3228 
3229   // FIXME: We don't really need this intermediate instruction. The intrinsic
3230   // should be fixed to have a memory operand. Since it's readnone, we're not
3231   // allowed to add one.
3232   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
3233   MI.RemoveOperand(1); // Remove intrinsic ID
3234 
3235   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
3236   // TODO: Should this use datalayout alignment?
3237   const unsigned MemSize = (Size + 7) / 8;
3238   const unsigned MemAlign = 4;
3239   MachineMemOperand *MMO = MF.getMachineMemOperand(
3240     MachinePointerInfo(),
3241     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3242     MachineMemOperand::MOInvariant, MemSize, MemAlign);
3243   MI.addMemOperand(MF, MMO);
3244 
3245   // There are no 96-bit result scalar loads, but widening to 128-bit should
3246   // always be legal. We may need to restore this to a 96-bit result if it turns
3247   // out this needs to be converted to a vector load during RegBankSelect.
3248   if (!isPowerOf2_32(Size)) {
3249     LegalizerHelper Helper(MF, *this, Observer, B);
3250     B.setInstr(MI);
3251 
3252     if (Ty.isVector())
3253       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
3254     else
3255       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
3256   }
3257 
3258   Observer.changedInstr(MI);
3259   return true;
3260 }
3261 
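// Dispatch custom legalization of target intrinsics: the structured
// control-flow intrinsics become SI_IF/SI_ELSE/SI_LOOP pseudos,
// preloaded-argument queries are lowered to reads of the corresponding input
// registers, and the buffer, image and atomic intrinsics are lowered to
// target generic pseudos.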
3262 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
3263                                             MachineIRBuilder &B,
3264                                             GISelChangeObserver &Observer) const {
3265   MachineRegisterInfo &MRI = *B.getMRI();
3266 
  // Replace the G_BRCOND use of a control-flow intrinsic result with the
  // exec-manipulating branch pseudos.
3268   auto IntrID = MI.getIntrinsicID();
3269   switch (IntrID) {
3270   case Intrinsic::amdgcn_if:
3271   case Intrinsic::amdgcn_else: {
3272     MachineInstr *Br = nullptr;
3273     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3274       const SIRegisterInfo *TRI
3275         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3276 
3277       B.setInstr(*BrCond);
3278       Register Def = MI.getOperand(1).getReg();
3279       Register Use = MI.getOperand(3).getReg();
3280 
3281       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
3282       if (Br)
3283         BrTarget = Br->getOperand(0).getMBB();
3284 
3285       if (IntrID == Intrinsic::amdgcn_if) {
3286         B.buildInstr(AMDGPU::SI_IF)
3287           .addDef(Def)
3288           .addUse(Use)
3289           .addMBB(BrTarget);
3290       } else {
3291         B.buildInstr(AMDGPU::SI_ELSE)
3292           .addDef(Def)
3293           .addUse(Use)
3294           .addMBB(BrTarget)
3295           .addImm(0);
3296       }
3297 
3298       if (Br)
3299         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
3300 
3301       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
3302       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
3303       MI.eraseFromParent();
3304       BrCond->eraseFromParent();
3305       return true;
3306     }
3307 
3308     return false;
3309   }
3310   case Intrinsic::amdgcn_loop: {
3311     MachineInstr *Br = nullptr;
3312     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
3313       const SIRegisterInfo *TRI
3314         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
3315 
3316       B.setInstr(*BrCond);
3317 
3318       // FIXME: Need to adjust branch targets based on unconditional branch.
3319       Register Reg = MI.getOperand(2).getReg();
3320       B.buildInstr(AMDGPU::SI_LOOP)
3321         .addUse(Reg)
3322         .addMBB(BrCond->getOperand(1).getMBB());
3323       MI.eraseFromParent();
3324       BrCond->eraseFromParent();
3325       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
3326       return true;
3327     }
3328 
3329     return false;
3330   }
3331   case Intrinsic::amdgcn_kernarg_segment_ptr:
3332     return legalizePreloadedArgIntrin(
3333       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3334   case Intrinsic::amdgcn_implicitarg_ptr:
3335     return legalizeImplicitArgPtr(MI, MRI, B);
3336   case Intrinsic::amdgcn_workitem_id_x:
3337     return legalizePreloadedArgIntrin(MI, MRI, B,
3338                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3339   case Intrinsic::amdgcn_workitem_id_y:
3340     return legalizePreloadedArgIntrin(MI, MRI, B,
3341                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3342   case Intrinsic::amdgcn_workitem_id_z:
3343     return legalizePreloadedArgIntrin(MI, MRI, B,
3344                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3345   case Intrinsic::amdgcn_workgroup_id_x:
3346     return legalizePreloadedArgIntrin(MI, MRI, B,
3347                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3348   case Intrinsic::amdgcn_workgroup_id_y:
3349     return legalizePreloadedArgIntrin(MI, MRI, B,
3350                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3351   case Intrinsic::amdgcn_workgroup_id_z:
3352     return legalizePreloadedArgIntrin(MI, MRI, B,
3353                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3354   case Intrinsic::amdgcn_dispatch_ptr:
3355     return legalizePreloadedArgIntrin(MI, MRI, B,
3356                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
3357   case Intrinsic::amdgcn_queue_ptr:
3358     return legalizePreloadedArgIntrin(MI, MRI, B,
3359                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
3360   case Intrinsic::amdgcn_implicit_buffer_ptr:
3361     return legalizePreloadedArgIntrin(
3362       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3363   case Intrinsic::amdgcn_dispatch_id:
3364     return legalizePreloadedArgIntrin(MI, MRI, B,
3365                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
3366   case Intrinsic::amdgcn_fdiv_fast:
3367     return legalizeFDIVFastIntrin(MI, MRI, B);
3368   case Intrinsic::amdgcn_is_shared:
3369     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
3370   case Intrinsic::amdgcn_is_private:
3371     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
3372   case Intrinsic::amdgcn_wavefrontsize: {
3373     B.setInstr(MI);
3374     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
3375     MI.eraseFromParent();
3376     return true;
3377   }
3378   case Intrinsic::amdgcn_s_buffer_load:
3379     return legalizeSBufferLoad(MI, B, Observer);
3380   case Intrinsic::amdgcn_raw_buffer_store:
3381   case Intrinsic::amdgcn_struct_buffer_store:
3382     return legalizeBufferStore(MI, MRI, B, false, false);
3383   case Intrinsic::amdgcn_raw_buffer_store_format:
3384   case Intrinsic::amdgcn_struct_buffer_store_format:
3385     return legalizeBufferStore(MI, MRI, B, false, true);
3386   case Intrinsic::amdgcn_raw_tbuffer_store:
3387   case Intrinsic::amdgcn_struct_tbuffer_store:
3388     return legalizeBufferStore(MI, MRI, B, true, true);
3389   case Intrinsic::amdgcn_raw_buffer_load:
3390   case Intrinsic::amdgcn_struct_buffer_load:
3391     return legalizeBufferLoad(MI, MRI, B, false, false);
3392   case Intrinsic::amdgcn_raw_buffer_load_format:
3393   case Intrinsic::amdgcn_struct_buffer_load_format:
3394     return legalizeBufferLoad(MI, MRI, B, true, false);
3395   case Intrinsic::amdgcn_raw_tbuffer_load:
3396   case Intrinsic::amdgcn_struct_tbuffer_load:
3397     return legalizeBufferLoad(MI, MRI, B, true, true);
3398   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3399   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3400   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3401   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3402   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3403   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3404   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3405   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3406   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3407   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3408   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3409   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3410   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3411   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3412   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3413   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3414   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3415   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3416   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3417   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3418   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3419   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3420   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3421   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3422   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3423   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3424     return legalizeBufferAtomic(MI, B, IntrID);
3425   case Intrinsic::amdgcn_atomic_inc:
3426     return legalizeAtomicIncDec(MI, B, true);
3427   case Intrinsic::amdgcn_atomic_dec:
3428     return legalizeAtomicIncDec(MI, B, false);
3429   default: {
3430     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
3431             AMDGPU::getImageDimIntrinsicInfo(IntrID))
3432       return legalizeImageIntrinsic(MI, B, Observer, ImageDimIntr);
3433     return true;
3434   }
3435   }
3436 
3437   return true;
3438 }
3439